def data_process(directory="./inspec/all"):
    """Load the Inspec corpus and build BIO-style keyphrase tags.

    Pairs each ``*.abstr`` abstract with its ``*.uncontr`` keyphrase file
    (matched on the shared basename before the first dot), strips
    punctuation, lowercases the abstract tokens, and tags every token:
    0 = NO_KP, 1 = BEGIN_KP (first token of a keyphrase run),
    2 = INSIDE_KP (continuation of a keyphrase run).

    Parameters
    ----------
    directory : str
        Corpus directory holding the paired files. Defaults to the
        original hard-coded path so existing callers are unaffected.

    Returns
    -------
    tuple
        ``(all_data, full_text)`` where ``all_data`` is a list of
        ``(tokens, tags)`` pairs (one per article) and ``full_text`` is
        the concatenation of every article's tokens.
    """
    # Same punctuation class as before, compiled once outside the loop.
    punct = re.compile('[!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~]+')
    articles = os.listdir(directory)
    text_articles = sorted(a for a in articles if a.endswith('.abstr'))
    keyp_articles = sorted(a for a in articles if a.endswith('.uncontr'))

    all_data = []
    full_text = []
    # zip (instead of indexing by range) cannot raise IndexError when the
    # two file lists are unequal; unmatched tails are simply not paired.
    for abstr_name, keyp_name in zip(text_articles, keyp_articles):
        # Only pair files sharing a basename; mismatches are skipped,
        # matching the original `if a == b` guard.
        if abstr_name.split('.')[0] != keyp_name.split('.')[0]:
            continue

        with io.open(os.path.join(directory, abstr_name), 'r') as fh:
            text = fh.read().strip()
        text = punct.sub('', text)
        # NOTE(review): these replace literal backslash-n/t/r two-character
        # sequences, not real control characters; kept for parity with the
        # original (real whitespace is handled by split() below anyway).
        text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
        words = [w.lower() for w in text.split()]
        full_text += words

        with io.open(os.path.join(directory, keyp_name), 'r') as fh:
            raw_keyphrases = fh.read().strip().replace('; ', ' ')
        # Only membership is needed, so a set gives O(1) lookups.
        keyphrases = {w.lower() for w in raw_keyphrases.split()}

        tag = []
        for i, word in enumerate(words):
            if word not in keyphrases:
                tag.append(0)  # NO_KP
            elif i == 0 or words[i - 1] not in keyphrases:
                # Bug fix: the original evaluated words[i - 1] at i == 0,
                # which wrapped to the LAST word and could mislabel the
                # very first token as INSIDE_KP instead of BEGIN_KP.
                tag.append(1)  # BEGIN_KP
            else:
                tag.append(2)  # INSIDE_KP
        all_data.append((words, tag))
    return all_data, full_text
def tokenizer(text):
    """Split *text* on single spaces and lowercase each resulting token."""
    return list(map(str.lower, text.split(' ')))
def tokenizer(text):
    """Split *text* on single spaces and lowercase each resulting token.

    NOTE(review): this redefines the identically-behaving ``tokenizer``
    above; consider deleting the duplicate definitions.
    """
    lowered = []
    for piece in text.split(' '):
        lowered.append(piece.lower())
    return lowered
def tokenizer(text):
    """Split *text* on single spaces and lowercase each resulting token.

    NOTE(review): third identically-behaving ``tokenizer`` in this file;
    this last definition is the one that wins at import time.
    """
    return [fragment.lower() for fragment in text.split(" ")]