def get_GermEval2017_TaskB_data(
        data_file='some-path/GermEval2017/train-2017-09-15.tsv',
        limit=np.Inf):
    '''
    download data from: https://sites.google.com/view/germeval2017-absa/data
    '''

    def process_line(line):
        split = line.split('\t')
        url, text, relevance, sentiment = split[:4]  # url column is not used
        datum = {'text': text, 'relevance': relevance, 'sentiment': sentiment}
        if len(split) > 4:
            datum['aspect_polarity'] = split[4]
        return datum

    data = []
    for line in data_io.read_lines(data_file, limit=limit):
        try:
            datum = process_line(line)
        except Exception:  # log malformed lines and skip instead of crashing
            print(line)
            continue
        data.append(datum)
    return data
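# Usage sketch (hedged: 'some-path' above is a placeholder, and the columns
# are read exactly as process_line parses them):
#
#   data = get_GermEval2017_TaskB_data(limit=1000)
#   from collections import Counter
#   print(Counter(d['sentiment'] for d in data))  # sentiment-label distribution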
def collect_annotations_write_to_table(brat_path):
    anno_files = [brat_path + '/' + f
                  for f in os.listdir(brat_path) if f.endswith('.ann')]
    eids_file_annolines = [
        (os.path.split(file)[1].replace('.ann', ''), file,
         list(data_io.read_lines(file)))
        for file in anno_files
    ]
    eids2annolines_to_collect = {
        eid: (file, anno_lines)
        for eid, file, anno_lines in eids_file_annolines
        if any(DONE_ANNO in line for line in anno_lines)
    }
    print('found %d ann-files to collect' % len(eids2annolines_to_collect))

    query = select([table]).where(table.c.id.in_(
        [json.dumps(eid) for eid in eids2annolines_to_collect.keys()]))

    def process_batch_fun(batch):
        batch = [row_to_dict(d) for d in batch]

        def process_doc(doc: Dict):
            _, anno_lines = eids2annolines_to_collect[doc['id']]
            anno = parse_anno_lines(anno_lines, doc['sentences'])
            new_ner = join_all_ner_annotations(anno['ner'], doc['sentences'])
            ner_anno = overwrite_ner_annotations(doc['ner'], new_ner,
                                                 annotator_human)
            return ner_anno

        return [{'id': json.dumps(d['id']),
                 'ner': json.dumps(process_doc(d))} for d in batch]

    process_table_batchwise(sqlalchemy_engine, query, table, process_batch_fun)

    # remove collected .ann files together with their .txt counterparts
    files_to_remove = [file for file, _ in eids2annolines_to_collect.values()]
    for file in files_to_remove:
        os.remove(file)
        os.remove(file.replace('.ann', '.txt'))
    return [f for f in anno_files if f not in files_to_remove]
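# For reference: the .ann files collected above use the brat standoff format,
# where entity lines look like
#   T1<TAB>Label start end<TAB>surface text
# parse_anno_lines is expected to consume exactly such lines.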
def get_MRPC_data(file):
    def parse_line(line):
        label, id1, id2, texta, textb = line.split('\t')
        return {'text': texta, 'textb': textb, 'labels': label}

    lines_g = data_io.read_lines(file)
    next(lines_g)  # skip the tsv header row
    data = [parse_line(line) for line in lines_g]
    return data
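# Usage sketch ('some-path' is a placeholder; msr_paraphrase_train.txt is the
# canonical MRPC file name):
#
#   pairs = get_MRPC_data('some-path/MRPC/msr_paraphrase_train.txt')
#   # each entry: {'text': ..., 'textb': ..., 'labels': '1' (paraphrase) or '0'}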
def build_flashtext_trie(file, limit=numpy.Inf):
    id2phrases = {}

    def process_line(line):
        line = line.replace('\n', '')
        if '\t' in line:
            phrase_id, phrase = line.split('\t')  # renamed: 'id' shadows the builtin
            id2phrases.setdefault(phrase_id, []).append(phrase)
        else:  # no id given -> enumerate the phrases
            id2phrases[len(id2phrases) + 1] = [line]

    for line in data_io.read_lines(file, limit=limit):
        process_line(line)

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(id2phrases)
    # print(keyword_processor.extract_keywords('gesellschaftervertrag'))
    return keyword_processor
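# Sketch of how the returned trie is meant to be used; flashtext's
# extract_keywords with span_info=True yields (id, start_char, end_char)
# triples ('phrases.tsv' is a hypothetical file name):
#
#   kwp = build_flashtext_trie('phrases.tsv')
#   matches = kwp.extract_keywords('some text to scan', span_info=True)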
def unittest_parse_brat_annotations():
    # ip = '10.1.1.29'
    ip = 'localhost'
    sqlalchemy_base, sqlalchemy_engine = get_sqlalchemy_base_engine(host=ip)
    table = get_tables_by_reflection(sqlalchemy_base.metadata,
                                     sqlalchemy_engine)['scierc']
    brat_path = './brat_configurations'
    # write_brat_annotations(select([table]).limit(3), brat_path, sqlalchemy_engine)
    for d in sqlalchemy_engine.execute(select([table]).limit(3)):
        doc = row_to_dict(d)
        ann_file = write_brat_annotation(doc, brat_path)
        _, _, tok2sent_id = spaced_tokens_and_tokenoffset2charoffset(
            doc['sentences'])
        anno = parse_anno_lines(data_io.read_lines(ann_file), doc['sentences'])
        # round-trip check: spans written to brat must parse back unchanged
        assert all(
            s1 == s2 and e1 == e2 and l1 == l2 and a1 == a2
            for (a1, sents1), (a2, sents2) in zip(doc['ner'].items(),
                                                  anno['ner'].items())
            for x, y in zip(sents1, sents2)
            for (s1, e1, l1), (s2, e2, l2) in zip(x, y)
        )
def get_Clef2019_data(data_path):
    return [
        parse_line(f, l)
        for f in os.listdir(data_path)
        for l in data_io.read_lines(os.path.join(data_path, f))
    ]
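# parse_line is defined elsewhere; as a sketch only, for tab-separated
# Clef2019 files it would look roughly like this (the column layout is an
# assumption -- verify against the actual data files):
#
#   def parse_line(file_name, line):
#       fields = line.split('\t')
#       return {'file': file_name, 'fields': fields}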
    page_soup = BeautifulSoup(uClient.read(), "html.parser")
    uClient.close()
    # raw string with escaped dots, so the regex matches literal '.' characters
    pattern = re.compile(r'http://www\.genderopen\.de.{1,400}\.pdf')
    page_str = str(page_soup)
    filenames = pattern.findall(page_str)
    return filenames


if __name__ == '__main__':
    data_dir = '/home/tilo/data/gender_open_rep_pdfs'
    if not os.path.isdir(data_dir):
        os.mkdir(data_dir)
    urls_file = data_dir + '/urls.txt'
    if os.path.isfile(urls_file):
        already_downloaded_urls = list(data_io.read_lines(urls_file))
    else:
        already_downloaded_urls = []
    for query_keyword in ['Geschlecht', 'Eine', 'theorie', 'theory']:  # 'women', 'Frau'
        for page_number in range(1, 200000):
            while True:  # retry until the query succeeds
                try:
                    filenames = get_pdf_urls(query_keyword, page_number)
                    break
                except Exception:
                    print('retrying query: %s; page: %d'
                          % (query_keyword, page_number))
                    time.sleep(5)
        try:
            d = {k: convert_types(v, t) for k, v, t in zip(cols, s, types)}
        except Exception:
            # rebuild without type conversion only to render a helpful error
            d = {k: v for k, v, t in zip(cols, s, types)}
            raise Exception('error parsing: %s' % str(d))
        return d

    data_g = (line_to_dict(line) for line in line_g)
    with sqlalchemy_engine.connect() as conn:
        insert_in_table(conn, table, data_g, batch_size=1_000_000)


if __name__ == '__main__':
    data_path = '/docker-share/data/MAG/'
    for schema in [Author]:
        table = schema.__table__
        table_name = schema.__tablename__
        print('populating: %s' % table_name)
        lines_g = data_io.read_lines(
            data_path + '%s.txt.gz' % table_name.capitalize())
        skip_numrows(table, lines_g)
        g = tqdm(lines_g)
        populate_table(table, g)

'''
number of papers: 214100980 (zcat /docker-share/data/MAG/Papers.txt.gz | wc -l)
populating papers took ~15 hours!!
skipping 7_610_000 rows took: 0.54
206_402_980it [14:49:40, 3866.65it/s] -> why so slow?
'''
from itertools import islice  # safer than next() inside generator expressions (PEP 479)

# lines_g = data_io.read_lines_from_files(data_path)
lines_g = build_opensubtitles_lines_generator(data_source)
corpus_path = '%s/de_corpus' % path
# corpus_path = '%s/corpus' % '/tmp'
build_train_valid_test_files = True
if build_train_valid_test_files:
    if os.path.isdir(corpus_path):
        shutil.rmtree(corpus_path)
    if not os.path.exists(corpus_path):
        os.mkdir(corpus_path)

    num_train_docs = 20_000
    train_file = '%s/train.txt' % corpus_path
    data_io.write_to_file(train_file, islice(lines_g, num_train_docs))

    train_split_folder = corpus_path + '/train'
    if not os.path.exists(train_split_folder):
        os.mkdir(train_split_folder)

    train_lines_g = data_io.read_lines(train_file)
    num_train_docs = sum(1 for line in data_io.read_lines(train_file))
    lines_per_split = 100
    num_train_splits = int(numpy.ceil(num_train_docs / lines_per_split))
    for k in range(num_train_splits):
        split_file = train_split_folder + '/train_split_%d' % k
        # islice lets the last split be shorter than lines_per_split
        data_io.write_to_file(split_file,
                              islice(train_lines_g, lines_per_split))
        # shutil.copy(train_file, train_split_folder + '/train_split_%d' % k)

    data_io.write_to_file('%s/valid.txt' % corpus_path,
                          islice(lines_g, lines_per_split))
    data_io.write_to_file('%s/test.txt' % corpus_path,
                          islice(lines_g, lines_per_split))

is_forward_lm = True
dictionary: Dictionary = Dictionary.load('chars')
corpus = TextCorpus(corpus_path,