Ejemplo n.º 1
0
def clear_documents(docs):
    """Clean, tokenize and stem documents, then build a dictionary and corpus.

    For every document a dump file is written to
    ``{Settings.DIRECTORY}/data/words/{Settings.PROJECT_NAME}/{doc[0]}``
    containing the text before and after ``StringUtils.clear_text``.

    Args:
        docs: Iterable of indexable items where ``doc[0]`` is used as the
            dump file name and ``doc[1]`` is the raw text to process.

    Returns:
        A ``(texts, corpus, dictionary)`` tuple: ``texts`` is one list of
        stemmed, stop-word-free tokens per document, ``dictionary`` is the
        ``corpora.Dictionary`` built from ``texts`` after
        ``filter_extremes``, and ``corpus`` is the ``doc2bow`` result for
        each entry of ``texts``.
    """
    tokenizer = RegexpTokenizer(r'\w+')

    # English stop words; a frozenset gives O(1) membership tests in the
    # per-token filter below without changing which tokens are dropped.
    en_stop = frozenset(get_stop_words('en'))

    p_stemmer = PorterStemmer()

    # Clean every document and dump its before/after text for inspection.
    docs_content = []
    for doc in docs:
        dump_path = f"{Settings.DIRECTORY}/data/words/{Settings.PROJECT_NAME}/{doc[0]}"
        # A single write-only handle replaces the former "w+" / "a+" pair.
        # The "before" section is still written first, so it is kept on disk
        # even if clear_text raises (the context manager flushes on exit).
        # The encoding is explicit so non-ASCII text does not depend on the
        # platform's default locale.
        with open(dump_path, "w", encoding="utf-8") as f:
            f.write(f"{doc[0]}\n")
            f.write("Before processing:\n")
            f.write(f"{doc[1]}\n")

            doc_content = StringUtils.clear_text(doc[1])
            docs_content.append(doc_content)

            f.write("\nAfter processing:\n")
            f.write(f"{doc_content}\n")

    # Lower-case, tokenize, drop stop words and stem each cleaned document.
    texts = []
    for text in docs_content:
        tokens = tokenizer.tokenize(text.lower())
        texts.append(
            [p_stemmer.stem(token) for token in tokens if token not in en_stop]
        )

    # Turn tokenized documents into an id <-> term dictionary.
    dictionary = corpora.Dictionary(texts)

    # Filter outliers from the dictionary using the thresholds below.
    dictionary.filter_extremes(no_below=3, no_above=0.75, keep_n=1000)

    # Convert tokenized documents into a document-term matrix.
    corpus = [dictionary.doc2bow(text) for text in texts]

    return texts, corpus, dictionary
Ejemplo n.º 2
0
    def apply_tfidf_to_pair(self, source, target):
        """Return the similarity of two texts after cleaning them.

        Both arguments are passed through ``StringUtils.clear_text`` (source
        first, then target) and the cleaned pair is handed to
        ``self.cosine_sim``, whose result is returned unchanged.
        """
        cleaned_source = StringUtils.clear_text(source)
        cleaned_target = StringUtils.clear_text(target)
        similarity = self.cosine_sim(cleaned_source, cleaned_target)
        return similarity