Example #1
def get_GermEval2017_TaskB_data(
        data_file='some-path/GermEval2017/train-2017-09-15.tsv', limit=np.Inf):
    '''
    download data from: https://sites.google.com/view/germeval2017-absa/data
    '''
    def process_line(line):
        split = line.split('\t')
        url = split[0]
        text = split[1]
        relevance = split[2]
        sentiment = split[3]
        datum = {'text': text, 'relevance': relevance, 'sentiment': sentiment}
        if len(split) > 4:
            aspect_polarity = split[4]
            datum['aspect_polarity'] = aspect_polarity
        return datum

    data = []
    for line in data_io.read_lines(data_file, limit=limit):
        try:
            datum = process_line(line)
        except Exception:  # skip malformed lines
            print(line)
            continue
        data.append(datum)

    return data
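data_io.read_lines is not shown in this example; a minimal stand-in, assuming it simply yields the file's lines (without the trailing newline) up to limit, could look like the stub below. The name read_lines_stub is ours, for illustration only.

# Illustrative stub; the real data_io.read_lines may behave differently.
def read_lines_stub(path, limit=float('inf')):
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            yield line.rstrip('\n')

# e.g. data = get_GermEval2017_TaskB_data('train-2017-09-15.tsv', limit=100)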
Example #2
def collect_annotations_write_to_table(brat_path):
    anno_files = [brat_path + '/' + f
                  for f in os.listdir(brat_path) if f.endswith('.ann')]
    eids_file_annolines = [
        (os.path.split(file)[1].replace('.ann', ''), file, list(data_io.read_lines(file)))
        for file in anno_files
    ]
    eids2annolines_to_collect = {
        eid: (file, anno_lines)
        for eid, file, anno_lines in eids_file_annolines
        if any(DONE_ANNO in line for line in anno_lines)
    }
    print('found %d ann-files to collect' % len(eids2annolines_to_collect))
    query = select([table]).where(
        table.c.id.in_([json.dumps(eid) for eid in eids2annolines_to_collect.keys()]))

    def process_batch_fun(batch):
        batch = [row_to_dict(d) for d in batch]

        def process_doc(doc: Dict):
            file, anno_lines = eids2annolines_to_collect[doc['id']]
            anno = parse_anno_lines(anno_lines, doc['sentences'])
            new_ner = join_all_ner_annotations(anno['ner'], doc['sentences'])
            ner_anno = overwrite_ner_annotations(doc['ner'], new_ner, annotator_human)
            return ner_anno

        return [{'id': json.dumps(d['id']), 'ner': json.dumps(process_doc(d))}
                for d in batch]

    process_table_batchwise(sqlalchemy_engine, query, table, process_batch_fun)
    files_to_remove = [file for file, _ in eids2annolines_to_collect.values()]
    for file in files_to_remove:
        os.remove(file)
        os.remove(file.replace('.ann', '.txt'))

    return [f for f in anno_files if f not in files_to_remove]
Example #3
def get_MRPC_data(file):
    def parse_line(line):
        label, id1, id2, texta, textb = line.split('\t')
        return {'text': texta, 'textb': textb, 'labels': label}

    lines_g = data_io.read_lines(file)
    next(lines_g)  # skip the header line
    data = [parse_line(line) for line in lines_g]
    return data
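For context: the MRPC release files (msr_paraphrase_train.txt / msr_paraphrase_test.txt) are tab-separated with a header row and the columns Quality, #1 ID, #2 ID, #1 String, #2 String, which is why the first line is skipped and each row unpacks into five fields. A tiny check of the parsing logic on a synthetic line (the values below are made up):

# Synthetic MRPC-style row, for illustration only.
row = '\t'.join(['1', '101', '102', 'The cat sat on the mat.', 'A cat was sitting on the mat.'])
label, id1, id2, texta, textb = row.split('\t')
assert label == '1' and textb.startswith('A cat')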
Example #4
def build_flashtext_trie(file, limit=numpy.Inf):
    id2phrases = {}

    def process_line(line):
        line = line.replace('\n', '')
        if '\t' in line:
            phrase_id, phrase = line.split('\t')
            if phrase_id in id2phrases:
                id2phrases[phrase_id].append(phrase)
            else:
                id2phrases[phrase_id] = [phrase]
        else:
            phrase = line
            id2phrases[len(id2phrases) + 1] = [phrase]

    for line in data_io.read_lines(file, limit=limit):
        process_line(line)
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(id2phrases)
    # print(keyword_processor.extract_keywords('gesellschaftervertrag'))
    return keyword_processor
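flashtext maps every phrase in a value list to its dictionary key (the "clean name"), so extract_keywords returns the ids rather than the matched surface forms. A short self-contained usage sketch (the phrases below are made up):

from flashtext import KeywordProcessor

kp = KeywordProcessor()  # case-insensitive by default
kp.add_keywords_from_dict({'contract': ['gesellschaftervertrag', 'kaufvertrag']})
print(kp.extract_keywords('Der Gesellschaftervertrag wurde unterschrieben.'))  # -> ['contract']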
Example #5
def unittest_parse_brat_annotations():
    # ip = '10.1.1.29'
    ip = 'localhost'
    sqlalchemy_base, sqlalchemy_engine = get_sqlalchemy_base_engine(host=ip)
    table = get_tables_by_reflection(sqlalchemy_base.metadata,
                                     sqlalchemy_engine)['scierc']
    brat_path = './brat_configurations'
    # write_brat_annotations(select([table]).limit(3), brat_path, sqlalchemy_engine)
    for d in sqlalchemy_engine.execute(select([table]).limit(3)):
        doc = row_to_dict(d)
        ann_file = write_brat_annotation(doc, brat_path)
        _, _, tok2sent_id = spaced_tokens_and_tokenoffset2charoffset(
            doc['sentences'])

        anno = parse_anno_lines(data_io.read_lines(ann_file), doc['sentences'])

        assert (all([
            s1 == s2 and e1 == e2 and l1 == l2 and a1 == a2
            for (a1, sents1), (
                a2, sents2) in zip(doc['ner'].items(), anno['ner'].items())
            for x, y in zip(sents1, sents2)
            for (s1, e1, l1), (s2, e2, l2) in zip(x, y)
        ]))
Example #6
def get_Clef2019_data(data_path):
    return [
        parse_line(f, l) for f in os.listdir(data_path)
        for l in data_io.read_lines(os.path.join(data_path, f))
    ]
Example #7
    page_soup = BeautifulSoup(uClient.read(), "html.parser")
    uClient.close()
    pattern = re.compile(r'http://www\.genderopen\.de.{1,400}\.pdf')
    page_str = str(page_soup)
    filenames = pattern.findall(page_str)
    return filenames


if __name__ == '__main__':
    data_dir = '/home/tilo/data/gender_open_rep_pdfs'
    if not os.path.isdir(data_dir):
        os.mkdir(data_dir)

    urls_file = data_dir + '/urls.txt'
    if os.path.isfile(urls_file):
        already_downloaded_urls = list(data_io.read_lines(urls_file))
    else:
        already_downloaded_urls = []

    for query_keyword in ['Geschlecht', 'Eine', 'theorie',
                          'theory']:  #'women','Frau'
        for page_number in range(1, 200000):
            while True:
                try:
                    filenames = get_pdf_urls(query_keyword, page_number)
                    break
                except Exception:
                    print('retrying query: %s; page: %d' %
                          (query_keyword, page_number))
                    time.sleep(5)
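The loop above retries forever with a fixed 5-second pause; a bounded variant (a sketch, not part of the original script) gives up after a few attempts:

import time

def retry(fn, attempts=5, pause_s=5):
    # call fn() until it succeeds or the attempts are used up
    for i in range(attempts):
        try:
            return fn()
        except Exception as e:
            print('attempt %d failed: %r' % (i + 1, e))
            time.sleep(pause_s)
    raise RuntimeError('giving up after %d attempts' % attempts)

# filenames = retry(lambda: get_pdf_urls(query_keyword, page_number))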
Example #8
        try:
            d = {k: convert_types(v, t) for k, v, t in zip(cols, s, types)}
        except Exception:
            # rebuild the row without type conversion, only to show it in the error
            d = {k: v for k, v, t in zip(cols, s, types)}
            raise Exception('error parsing: %s' % str(d))
        return d

    data_g = (line_to_dict(line) for line in line_g)
    with sqlalchemy_engine.connect() as conn:
        insert_in_table(conn, table, data_g, batch_size=1_000_000)


if __name__ == '__main__':
    data_path = '/docker-share/data/MAG/'
    for schema in [Author]:
        table = schema.__table__
        table_name = schema.__tablename__
        print('populating: %s' % table_name)
        lines_g = data_io.read_lines(data_path +
                                     '%s.txt.gz' % table_name.capitalize())
        skip_numrows(table, lines_g)
        g = tqdm(lines_g)
        populate_table(table, g)
'''

number of papers: 214100980 (zcat /docker-share/data/MAG/Papers.txt.gz | wc -l)

populating: papers took ~15 hours!!
skipping: 7_610_000 rows took: 0.54
206_402_980 it [14:49:40, 3866.65it/s] -> why so slow?
'''
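insert_in_table is not shown here; with plain SQLAlchemy Core, a batched insert over a generator of row dicts usually looks roughly like the sketch below (an assumption about what the helper might do, not its actual code):

from itertools import islice

def insert_in_batches(conn, table, rows, batch_size=100_000):
    rows = iter(rows)
    while True:
        batch = list(islice(rows, batch_size))
        if not batch:
            break
        conn.execute(table.insert(), batch)  # executemany over the whole batch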
Example #9
    # lines_g = data_io.read_lines_from_files(data_path)
    lines_g = build_opensubtitles_lines_generator(data_source)

    corpus_path = '%s/de_corpus' % path
    # corpus_path = '%s/corpus' % '/tmp'
    build_train_valid_test_files = True
    if build_train_valid_test_files:
        if os.path.isdir(corpus_path):
            shutil.rmtree(corpus_path)
        if not os.path.exists(corpus_path):
            os.mkdir(corpus_path)
        num_train_docs = 20_000
        train_file = '%s/train.txt' % corpus_path
        data_io.write_to_file(train_file, (next(lines_g) for k in range(num_train_docs)))
        train_split_folder = corpus_path + '/train'
        if not os.path.exists(train_split_folder):
            os.mkdir(train_split_folder)
        train_lines_g = data_io.read_lines(train_file)
        num_train_docs = sum(1 for line in data_io.read_lines(train_file))
        lines_per_split = 100
        num_train_splits = int(numpy.ceil(num_train_docs / lines_per_split))
        for k in range(num_train_splits):
            split_file = train_split_folder + '/train_split_%d'%k
            data_io.write_to_file(split_file, (next(train_lines_g) for k in range(lines_per_split)))

        # shutil.copy(train_file, train_split_folder + '/train_split_%d'%k)
        data_io.write_to_file('%s/valid.txt' % corpus_path, (next(lines_g) for k in range(lines_per_split)))
        data_io.write_to_file('%s/test.txt' % corpus_path, (next(lines_g) for k in range(lines_per_split)))

    is_forward_lm = True
    dictionary: Dictionary = Dictionary.load('chars')

    corpus = TextCorpus(corpus_path,