Example #1
def dois_from_scholar_urls():
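    # Query Google Scholar, pull DOIs out of the result URLs, and try to fetch
    # each paper via Sci-Hub; URLs without a DOI and DOIs that Sci-Hub cannot
    # serve are appended to log files.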
    sh = SciHub()
    no_doi_found_file = data_path + 'no_doi_found.txt'
    doi_not_in_scihub_file = data_path + 'doi_not_in_scihub.txt'
    no_doi_found = []
    doi_not_in_scihub = []
    shit_counter = 0
    pattern = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+$')
    articles = get_artices_from_googlescholar(phrase='emotional geographies',
                                              max_num_hits=100)
    print('num hits %d' % len(articles))
    for art in articles:
        url = art.attrs['url'][0]
        if any(p in url for p in ['https://books.google.com/books']):
            shit_counter += 1
            continue
        # url = 'https://journals.sagepub.com/doi/abs/10.1177/0891243287001002002'
        dois = pattern.findall(url)

        if len(dois) > 0:
            print(dois)
            for doi in dois:
                try:
                    sh.download(doi, destination=data_path)
                    print('got: %s' % doi)
                except Exception:
                    doi_not_in_scihub.append(doi)
        else:
            no_doi_found.append(url)
    data_io.write_to_file(no_doi_found_file, no_doi_found, 'ab')
    data_io.write_to_file(doi_not_in_scihub_file, doi_not_in_scihub, 'ab')
    print('shit: %d' % shit_counter)
Example #2
def write_brat_annotation(doc, path):
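    # Write one brat standoff document: the raw text goes into <doc_name>.txt
    # and the span/relation/attribute/note annotation lines into <doc_name>.ann.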
    doc_name, text, spans, relations, attributes, notes = build_brat_lines(doc)
    data_io.write_to_file(path + '/' + doc_name + '.txt', [text])
    lines = [span_to_ann_line(d, text) for d in spans]
    lines += [to_rel_ann_line(d) for d in relations]
    lines += [to_attr_ann_line(d) for d in attributes]
    lines += [to_notes_ann_line(d) for d in notes]
    ann_file = path + '/' + doc_name + '.ann'
    data_io.write_to_file(ann_file, lines)
    return ann_file
Example #3
def convert_pdfs_to_text(path_pdfs, path_texts):
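    # Extract text from every PDF in path_pdfs and write a matching .txt file
    # into path_texts; PDFs that pdftotext cannot open are skipped.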
    if not os.path.isdir(path_texts):
        os.mkdir(path_texts)

    for i, file in enumerate(os.listdir(path_pdfs)):
        try:
            with open(path_pdfs + '/' + file, "rb") as f:
                pdf = pdftotext.PDF(f)
        except Exception:
            continue
        pages = list(pdf)
        data_io.write_to_file(path_texts + '/' + file.replace('.pdf', '.txt'),
                              pages)
        if i % 100 == 0:
            sys.stdout.write('\r%d' % i)
Example #4
def process_file(file):
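    # Parse one XML file, drop <ref>, <formula> and <list> elements from each
    # <p> paragraph, and write the cleaned paragraph strings to processed_path
    # as <name>_processed.txt.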
    with open(file) as f:
        xml = f.read()
        soup = BeautifulSoup(xml)
        raw_paragraphs = soup.find_all('p')
        refs = soup.find_all('ref')
        formula = soup.find_all('formula')
        lists = soup.find_all('list')
        shit = [refs, formula, lists]
        paragraphs = [
            ' '.join(
                [str(x) for x in paragraph if not any(x in sh for sh in shit)])
            for paragraph in raw_paragraphs
        ]
        data_io.write_to_file(
            processed_path + '/' +
            os.path.split(file)[1].replace('.xml', '_processed.txt'),
            paragraphs)
Example #5
    nlp = spacy.load('en')

    # Add neural coref to SpaCy's pipe
    import neuralcoref
    neuralcoref.add_to_pipe(nlp)

    path = '/home/tilo/code/misc/clef2019-factchecking-task1/data/training'
    dump_path = '/home/tilo/data/coref'
    debate_lines = get_Clef2019_data(path)
    for k, batch in enumerate(
            iterable_to_batches([d['utterance'] for d in debate_lines],
                                batch_size=32),
            start=1):
        text = ' '.join(batch)
        doc = nlp(text)
        data_io.write_to_file(dump_path + '/text_%d.txt' % k, [text])
        m_id = [0]

        def get_inc_m_id():
            val = m_id[0]
            m_id[0] += 1
            return val

        standoff_corefs = []
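        # turn each neuralcoref cluster into standoff mentions: character
        # offsets plus a running mention id, with the cluster's main mention
        # kept separately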
        for cluster_id, cluster in enumerate(doc._.coref_clusters):
            mentions = [(s.start_char, s.end_char, s.text) for s in cluster]
            cluster_mention = (get_inc_m_id(), cluster.main.start_char,
                               cluster.main.end_char, cluster.main.text)
            mentions = [
                (get_inc_m_id(), start, end, text)
                for start, end, text in mentions if text != cluster.main.text
Example #6
                            retmax=1000000)
    search_results = Entrez.read(handle)
    ids = search_results["IdList"]
    num_hits = len(ids)
    print("Found %d citations" % num_hits)
    num_to_dump = min(num_to_dump, num_hits)
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]

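    # pull the records in batches of batch_size via the NCBI History server
    # (WebEnv/QueryKey from the search above) and yield each raw response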
    for start in range(0, num_to_dump, batch_size):
        print(start)
        fetch_handle = Entrez.efetch(
            db=database,
            # rettype="xml",
            # retmode="xml",
            retstart=start,
            retmax=batch_size,
            webenv=webenv,
            query_key=query_key)
        sleep(0.3)
        data = fetch_handle.read()
        # text = html2text.html2text(data)
        # print(text)
        fetch_handle.close()
        yield data


if __name__ == '__main__':

    data_io.write_to_file('./from_pubmed.xmls', pubmed_artice_generator())
Example #7
    for query_keyword in ['Geschlecht', 'Eine', 'theorie',
                          'theory']:  #'women','Frau'
        for page_number in range(1, 200000):
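            # retry the query until it succeeds, waiting 5 seconds between attempts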
            while True:
                try:
                    filenames = get_pdf_urls(query_keyword, page_number)
                    break
                except Exception:
                    print('retrying query: %s; page: %d' %
                          (query_keyword, page_number))
                    time.sleep(5)

            if len(filenames) == 0:
                break

            new_urls = [
                p for p in filenames if p not in already_downloaded_urls
            ]
            print('found %d new urls' % len(new_urls))
            already_downloaded_urls.extend(new_urls)
            for i, p in enumerate(new_urls):
                os.system('wget -N -b -q -P %s %s' % (data_dir, p))
                time.sleep(0.2)
                sys.stdout.write('\rnum-current page: %d' % i)
                sys.stdout.flush()
            data_io.write_to_file(urls_file, new_urls, 'ab')

            if len(already_downloaded_urls) % 100 == 0:
                print('got %d files' % len(already_downloaded_urls))
Example #8
    return s


if __name__ == '__main__':
    data_path = '/home/tilo/code/NLP/scisci_nlp/data/scierc_data/json/'
    data = getting_scierc_tagged_data(data_path + 'dev.json')

    # for text,spans in data:
    #     for start,end,label in spans:
    #         print('%s - %s'%(text[start:end],label))
    path = './dingens'
    if not os.path.isdir(path):
        os.mkdir(path)

    for doc_name, text, spans, relations in data:
        data_io.write_to_file(path + '/' + doc_name + '.txt', [text])
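        # map each (start, end) span to its index so the relation lines can
        # refer to mentions by a stable id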
        startend2Id = {
            '%d-%d' % (s, e): i
            for i, (s, e, l) in enumerate(spans)
        }

        def get_mention_id(s, e):
            return startend2Id['%d-%d' % (s, e)]

        lines = [
            span_to_ann_line(get_mention_id(s, e), s, e, l, text[s:e])
            for s, e, l in spans
        ]
        lines += [
            to_rel_ann_line(i, get_mention_id(s1, e1), get_mention_id(s2, e2),
                            label)
Example #9
        # one training step: zero the gradients, compute the batch loss,
        # backpropagate and update the parameters
        optimizer.zero_grad()
        loss = loss_fun(model, batch).mean()  # mean() for Data Parallelism
        loss.backward()
        optimizer.step()
        return loss.item()

    pytorch_methods.train(train_on_batch,
                          data_iter,
                          cfg.n_epochs,
                          tol=cfg.tol,
                          patience=cfg.patience,
                          verbose=True)

if __name__ == '__main__':
    path = '/home/tilo/data'
    data_path = path + '/ml_nlp_parsed'
    lines_g = data_io.read_lines_from_files(data_path, limit=100)
    def line_generator():
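        # chop every input line into 100-character chunks (newlines stripped)
        # and emit a line break after every 10th chunk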
        line_len = 100
        c = 0
        for line in lines_g:
            for s in range(0, len(line) - line_len, line_len):
                yield line[s:(s + line_len)].replace('\n', '')
                c += 1
                if c % 10 == 0:
                    yield '\n'

    data_io.write_to_file('/tmp/text.txt', line_generator())
    data_file = '/tmp/text.txt'
    fire.Fire(main)
Example #10
    # lines_g = (d['pdf_full_text'].replace('\n', '') for d in data_io.read_jsons_from_file('%s/arxiv.jsonl.gz' % path)
    #            if isinstance(d['pdf_full_text'],str) and len(d['pdf_full_text']) > 10)

    # lines_g = data_io.read_lines_from_files(data_path)
    lines_g = build_opensubtitles_lines_generator(data_source)

    corpus_path = '%s/de_corpus' % path
    # corpus_path = '%s/corpus' % '/tmp'
    build_train_valid_test_files = True
    if build_train_valid_test_files:
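        # rebuild the corpus directory from scratch: a train.txt with
        # num_train_docs lines, 100-line train splits, plus valid.txt and
        # test.txt drawn from the same generator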
        if os.path.isdir(corpus_path):
            shutil.rmtree(corpus_path)
        if not os.path.exists(corpus_path):
            os.mkdir(corpus_path)
        num_train_docs = 20_000
        train_file = '%s/train.txt' % corpus_path
        data_io.write_to_file(train_file, (next(lines_g) for _ in range(num_train_docs)))
        train_split_folder = corpus_path + '/train'
        if not os.path.exists(train_split_folder):
            os.mkdir(train_split_folder)
        train_lines_g = data_io.read_lines(train_file)
        num_train_docs = sum(1 for _ in data_io.read_lines(train_file))
        lines_per_split = 100
        num_train_splits = int(numpy.ceil(num_train_docs / lines_per_split))
        for k in range(num_train_splits):
            split_file = train_split_folder + '/train_split_%d' % k
            data_io.write_to_file(split_file, (next(train_lines_g) for _ in range(lines_per_split)))

        # shutil.copy(train_file, train_split_folder + '/train_split_%d'%k)
        data_io.write_to_file('%s/valid.txt' % corpus_path, (next(lines_g) for _ in range(lines_per_split)))
        data_io.write_to_file('%s/test.txt' % corpus_path, (next(lines_g) for _ in range(lines_per_split)))

    is_forward_lm = True