def process_token_list(self, token_list):
        """
        Takes the token_list returned from extract_entity_tokens(), processes
        the non-entity portions and combines all tokens into a single list. It
        also stores the entity names and list idxs in a dictionary

        Parameters:
            token_list (list, required): list of texts to be tokenized and already
                                         tokenized entities
        Returns:
            tokens (list): ordered list of tokens for the whole text
            entity_idxs (list): list of items and their token idx for each list
        """
        tokens = []
        entity_idxs = []
        CWT = ChemWordTokenizer()
        for j, item in enumerate(token_list):
            if isinstance(item, str):
                item_tokens = CWT.tokenize(item)
                ### Split numbers from common units
                split_tokens = []
                for token in item_tokens:
                    split_tokens += self.split_token(token)
                tokens += split_tokens
            else:
                # Entities arrive pre-tokenized; record the entity's first
                # token and the index of its last token in the combined list
                tokens += item
                item_idx = len(tokens) - 1
                entity_idxs.append([item[0], item_idx])
        return tokens, entity_idxs
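
# A minimal standalone sketch of how process_token_list is meant to be used
# (hedged: the method lives on a class that also defines split_token(), so
# this demo inlines the loop without the number/unit splitting step; the
# sample token_list below is hypothetical).
from chemdataextractor.nlp.tokenize import ChemWordTokenizer

def demo_process_token_list(token_list):
    tokens, entity_idxs = [], []
    cwt = ChemWordTokenizer()
    for item in token_list:
        if isinstance(item, str):
            tokens += cwt.tokenize(item)   # no split_token() step in this demo
        else:
            tokens += item                 # entity arrives pre-tokenized
            entity_idxs.append([item[0], len(tokens) - 1])
    return tokens, entity_idxs

# ["LiFePO4"] stands in for a pre-tokenized entity from extract_entity_tokens();
# this should yield a flat token list plus entity_idxs like [['LiFePO4', 4]]
# (exact tokenization may vary with the installed chemdataextractor version).
demo_tokens, demo_idxs = demo_process_token_list(
    ["The melting point of", ["LiFePO4"], "is high"])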
Example No. 2
def chem_tokenize(text):
    cwt = ChemWordTokenizer()
    tokens = cwt.tokenize(text)
    token_spans = cwt.span_tokenize(text)
    tokenized_info = []
    for (start, end), token in zip(token_spans, tokens):
        # span_tokenize yields exclusive end offsets; store inclusive ones
        tokenized_info.append((token, start, end - 1))
    return tokenized_info
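
# Usage sketch for chem_tokenize, assuming ChemWordTokenizer has been imported
# as in the script further below (token boundaries depend on the installed
# chemdataextractor version, so no exact output is claimed here):
for token, start, end in chem_tokenize("NaCl dissolves in H2O."):
    # `end` is the inclusive index of the token's last character in the text
    print(token, start, end)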
# Builds on top of ChemParserBatch_v7
#   Takes all paragraphs from the paper; no random selection
# v1 -> Increases speed by just appending to the csv file (instead of writing the whole dataframe)

from chemdataextractor.nlp.tokenize import ChemWordTokenizer
import chemdataextractor as cde
import pandas as pd
import random
import tqdm
import sys
import os
cwt = ChemWordTokenizer()
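
# A minimal sketch of the v1 speed-up described in the header comment: rows
# are appended to the CSV as they are produced instead of rewriting the whole
# dataframe each time. The output path and column names are hypothetical.
def append_rows_to_csv(rows, out_path='parsed_paragraphs.csv'):
    df = pd.DataFrame(rows, columns=['DOI', 'paragraph'])
    # Write the header only on the first append, when the file does not exist yet
    df.to_csv(out_path, mode='a', header=not os.path.exists(out_path), index=False)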

def read_file_list(file_list_path):
    file_list = []
    DOI_list = []
    with open(file_list_path, 'r') as open_file:
        for item in open_file.readlines():
            # Customize for each case: column 0 holds the file path, column 1 the DOI
            fields = item.replace('\n', '').split('\t')
            file_list.append(fields[0].split('/')[2])
            DOI_list.append(fields[1])
    return file_list, DOI_list
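
# Hedged usage sketch for read_file_list: each line is tab-separated, with a
# path whose third '/'-separated component is the file name and the DOI in
# the second column (the layout is case-specific, per the comment above).
# The demo file written here is hypothetical.
with open('demo_file_list.txt', 'w') as demo:
    demo.write('papers/html/article_001.html\t10.1000/xyz123\n')
file_names, DOIs = read_file_list('demo_file_list.txt')
# file_names == ['article_001.html'], DOIs == ['10.1000/xyz123']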

def clean_paragraph(paragraph):
    par_clean = paragraph.text.replace('\n', ' ')
    par_clean = par_clean.replace('/', ' / ')  # pad slashes so they tokenize as separate tokens
    par_clean = par_clean.replace('‑', '-')    # non-breaking hyphen (U+2011) -> ASCII hyphen
    par_clean = par_clean.replace('- ', '-')   # rejoin words split across hyphenated line breaks
    return par_clean
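
# Usage sketch: clean_paragraph expects an object with a .text attribute, such
# as a Paragraph element of a chemdataextractor Document (file name hypothetical):
from chemdataextractor.doc import Paragraph
with open('article_001.html', 'rb') as f:
    doc = cde.Document.from_file(f)
cleaned = [clean_paragraph(el) for el in doc.elements if isinstance(el, Paragraph)]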
Example No. 4
def __init__(self):
    self.cwt = ChemWordTokenizer()