def pos_tag(inp: str, out, tagdir: str = '/usr/local/tree-tagger'):
    """Generate a POS representation of the data and pickle the output.

    Reads (texts, genders, ages) via ``read_data``, replaces each text with
    its sequence of TreeTagger POS tags, and pickles the resulting triple
    ``(pos_texts, genders, ages)`` to the already-open file object ``out``.

    NOTE(review): this function was defined twice in the original source
    (identical bodies); the duplicate has been removed.

    :param inp: path handed to ``read_data`` to load the corpus
    :param out: writable binary file object the pickle is dumped to
    :param tagdir: TreeTagger installation directory
    """
    texts, genders, ages = read_data(inp)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en', TAGDIR=tagdir,
                                          TAGOPT='-token -sgml')
    logging.info('POS tagging data')
    pos_texts = []
    # Map document indices to percentage labels so progress is logged at
    # (roughly) every 1% of the corpus.
    d_infothresholds = {int(i / 100.0 * len(texts)): "%i%%" % (i)
                        for i in range(0, 101)}
    for i, t in enumerate(texts):
        # TreeTagger emits "token\tPOS\tlemma" lines; keep only well-formed
        # two-field entries and take the POS column.
        tags = [el.split('\t')[1] for el in tagger.tag_text(t)
                if len(el.split()) == 2]
        pos_texts.append(' '.join(tags))
        if i in d_infothresholds:
            logging.info('{} of documents processed'.format(d_infothresholds[i]))
    logging.info('Pickling results to {}'.format(out.name))
    pickle.dump((pos_texts, genders, ages), out)
def extract_feats(data: str, feat_name: str, outpath: str, pos: bool = False, **kwargs):
    """Extracts features from input data and writes them to disk

    Loads (texts, genders, ages) via ``read_data``, fits the vectorizer
    registered in ``FEAT_DICT[feat_name]`` on the texts, and saves the
    resulting feature matrix as a timestamped ``.npy`` file under ``outpath``.
    Any keyword arguments are forwarded to the vectorizer constructor and
    appended to ``feats.log`` so file names stay short.

    :param data: path to the input jar (pickled corpus)
    :param feat_name: key into ``FEAT_DICT`` selecting the vectorizer
    :param outpath: output directory (trailing slash added if missing)
    :param pos: if True and ``data`` is the default token jar, read from the
        POS jar instead (annotation fixed: default False is a bool, not str)
    :param kwargs: vectorizer parameters; ngram mode expects ``ngram_range``,
        ``max_features``, ``min_df`` and ``max_df``
    """
    # input correction: ensure a trailing slash. endswith() also copes with
    # an empty string, where the original outpath[-1] would raise IndexError.
    path = outpath if outpath.endswith('/') else outpath + '/'
    # POS features on the default token jar come from the POS jar instead.
    if pos and data == DEFAULT_TOKEN_JAR:
        src = DEFAULT_POS_JAR
    else:
        src = data
    texts, genders, ages = read_data(src)
    vect = FEAT_DICT[feat_name](**kwargs)
    if feat_name == 'ngram':
        logging.info(
            'Generating ngram feature vectors '
            '(range:{}, max_features:{}, min_df:{}, max_df:{})'.format(
                kwargs['ngram_range'], str(kwargs['max_features']),
                str(kwargs['min_df']), str(kwargs['max_df'])))
    else:
        logging.info('Generating {} feature vectors'.format(feat_name))
    feats = vect.fit_transform(texts)
    logging.info('Writing feature matrix to disk')
    if feat_name == 'ngram':
        # Encode the ngram range (and a pos_ prefix, when applicable) in the
        # file name so different runs don't collide.
        feat = 'pos_' + feat_name if pos else feat_name
        fn = 'feat_matr_{}_n{}_{}'.format(
            feat, '-'.join(str(n) for n in kwargs['ngram_range']),
            datetime.now().strftime('%m%d_%H%M%S'))
    else:
        fn = 'feat_matr_{}_{}'.format(
            feat_name, datetime.now().strftime('%m%d_%H%M%S'))
    np.save(path + fn, feats)
    # write parameters (if any) to log file to avoid ridiculous file names
    if kwargs:
        with open(path + 'feats.log', 'a') as op:
            op.write('\n{} {} {}\t{}'.format(
                datetime.now().strftime('%d-%m-%y %H:%M:%S'), fn, feat_name, src))
            for key, val in sorted(kwargs.items()):
                op.write(' {}:{}'.format(key, val))