import io
import os
import re


def data_process():
    directory = "./inspec/all"
    articles = os.listdir(directory)
    all_data = []
    fullText = []

    text_articles = []
    for article in articles:
        if article.endswith(".abstr"):
            text_articles.append(article)
    text_articles.sort()

    keyp_articles = []
    for article in articles:
        if article.endswith('.uncontr'):
            keyp_articles.append(article)
    keyp_articles.sort()

    # pair each abstract (.abstr) with its keyphrase file (.uncontr) by basename
    for article_ID in range(len(text_articles)):
        a = text_articles[article_ID].split('.')[0]
        b = keyp_articles[article_ID].split('.')[0]
        if a == b:
            with io.open(directory + "/" + text_articles[article_ID],
                         'r') as articleFile:
                text = articleFile.read().strip()
            # strip punctuation and collapse whitespace before tokenising
            text = re.sub('[!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~]+', '',
                          str(text))
            text = text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
            words1 = text.split()
            words = [x.lower() for x in words1]
            fullText += words

            with io.open(directory + "/" + keyp_articles[article_ID],
                         'r') as keyphraseFile:
                keyphrases1 = keyphraseFile.read().strip().replace('; ', ' ')
            keyphrases = [x.lower() for x in keyphrases1.split()]

            tag = []
            for i in range(len(words)):
                if words[i] not in keyphrases:
                    tag.append(0)  # NO_KP
                elif i == 0 or words[i - 1] not in keyphrases:
                    tag.append(1)  # BEGIN_KP
                else:
                    tag.append(2)  # INSIDE_KP

            all_data.append((words, tag))

    return all_data, fullText
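
A minimal usage sketch (hypothetical, assuming the ./inspec/all corpus above is available locally); it only illustrates the (words, tags) pairs and the fullText token list that data_process() returns.

if __name__ == '__main__':
    all_data, fullText = data_process()
    print(len(all_data), 'documents,', len(fullText), 'tokens overall')
    words, tags = all_data[0]
    # tag values: 0 = NO_KP, 1 = BEGIN_KP, 2 = INSIDE_KP
    for w, t in zip(words[:10], tags[:10]):
        print(w, t)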
Example #2
def tokenizer(text):
    return [tok.lower() for tok in text.split(' ')]
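
A hypothetical call, just to show the shape of the output from this whitespace tokenizer:

print(tokenizer('Keyphrase Extraction From Abstracts'))
# ['keyphrase', 'extraction', 'from', 'abstracts']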
Example #3
def tokenizer(text):
    return [token.lower() for token in text.split(" ")]