def topic_finder(document):
    """
    Find candidate topics in the given text document.
    :param document: document object exposing find_topic() and a raw text sample
    :return: list of possible topic phrases
    """
    topics_list = []
    try:
        important_nouns = document.find_topic()
        # Load the pre-trained trigram POS tagger (can be rebuilt via train_tagger()).
        with open('trained_tagger.pkl', 'rb') as tagger_file:
            trigram_tagger = pickle.load(tagger_file)
        sentences = pre.tokenize_to_sentences(
            pre.remove_punctuation(document.sample))
        sentences = [pre.tokenize_to_words(sent) for sent in sentences]
        sentences = [
            sentence for sentence in sentences if important_nouns[0].lower() in
            [word.lower() for word in sentence]
        ]
        tagged_sentences = [trigram_tagger.tag(sent) for sent in sentences]
        svo_data = [
            get_svo(sentence, important_nouns[0])
            for sentence in tagged_sentences
        ]
        for svo in svo_data:
            # Join the extracted (word, tag) tuples back into a topic phrase.
            topics_list.append(' '.join(word[0] for word in svo))
    except IndexError:
        if not topics_list:
            topics_list.append("Topic not found. Need more data.")
    return topics_list
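topic_finder() unpickles trained_tagger.pkl, but the commented-out train_tagger() that produces it is not shown. A minimal sketch of what such a helper could look like, assuming an NLTK backoff-chained tagger trained on the treebank corpus (the function name and training data are assumptions, not the original implementation):

import pickle

import nltk
from nltk.corpus import treebank


def train_tagger(path='trained_tagger.pkl'):
    # Hypothetical trainer for the tagger loaded in topic_finder():
    # chain unigram -> bigram -> trigram taggers with backoff, then pickle the result.
    train_sents = treebank.tagged_sents()
    unigram = nltk.UnigramTagger(train_sents)
    bigram = nltk.BigramTagger(train_sents, backoff=unigram)
    trigram = nltk.TrigramTagger(train_sents, backoff=bigram)
    with open(path, 'wb') as tagger_file:
        pickle.dump(trigram, tagger_file)
    return trigram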
def preprocess_doc(doc):
    """Run the full preprocessing pipeline on a single document."""
    doc = preprocessing.tokenize(doc)
    doc = preprocessing.remove_punctuation(doc)
    doc = preprocessing.remove_numbers(doc)
    doc = preprocessing.lower(doc)
    doc = preprocessing.remove_common_stopwords(doc)
    doc = preprocessing.clean_doc(doc)
    return doc
Example #3
def preprocess_corpus(documents):
    """Apply the preprocessing pipeline to every document and drop empty results."""
    documents = list(map(preprocessing.tokenize, documents))
    documents = [preprocessing.remove_punctuation(doc) for doc in documents]
    documents = [preprocessing.remove_numbers(doc) for doc in documents]
    documents = [preprocessing.lower(doc) for doc in documents]
    documents = [preprocessing.remove_common_stopwords(doc) for doc in documents]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    documents = [doc for doc in documents if doc]
    return documents
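A minimal usage sketch for the two helpers above, assuming the project's preprocessing module is importable and returns token lists:

raw_docs = [
    "Topic models find latent themes in a collection of text.",
    "LDA is trained on a bag-of-words representation of the corpus.",
]
single = preprocess_doc(raw_docs[0])    # one cleaned, tokenized document
corpus = preprocess_corpus(raw_docs)    # cleaned corpus with empty documents dropped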
Example #4
def full_preprocessing(self):
    """General preprocessing of the document sample. This method removes
    punctuation (periods and commas are kept), lowercases the text, removes
    English stop words, and tokenizes the text into sentences, into words,
    and into a list of word-tokenized sentences."""
    self.text = pre.remove_punctuation(self.text)
    self.text = pre.to_lowercase(self.text)
    self.words = pre.tokenize_to_words(self.text)
    self.words = pre.remove_stopwords(self.words)
    self.text = ' '.join(self.words)
    self.sentences = pre.tokenize_to_sentences(self.text)
    self.normalized_sample = [pre.tokenize_to_words(sent) for sent in self.sentences]
    return self.sentences
def clean_data():
    """
    Clean the tweets by removing punctuation and stop words.
    :return: DataFrame with the label and the cleaned word tokens
    """
    data = sc.textFile("data/data.txt")
    # Each line holds the tweet text and its label as tab-separated fields.
    col_rdd = data.map(lambda x: (x.split('\t')[0], x.split('\t')[-1]))
    punctuation_removed_rdd = col_rdd.map(
        lambda x: (remove_punctuation(x[0]), float(x[1])))

    data_df = sqlContext.createDataFrame(punctuation_removed_rdd,
                                         ["text", "label"])
    # StopWordsRemover expects an array column, so tokenize the text first.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    tokenized_df = tokenizer.transform(data_df)
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="words",
                               stopWords=stopwords.words('english'))
    return remover.transform(tokenized_df).select(["label", "words"])
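clean_data() relies on several module-level objects that are not shown in this snippet. A sketch of one way they could be set up (the names mirror the snippet; the remove_punctuation helper and the app name are assumptions):

import re

from nltk.corpus import stopwords
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import StopWordsRemover, Tokenizer

sc = SparkContext(appName="tweet-cleaning")
sqlContext = SQLContext(sc)


def remove_punctuation(text):
    # Hypothetical helper: keep only word characters and whitespace.
    return re.sub(r"[^\w\s]", "", text)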
Example #6
def preprocess_doc(row, context=True):
    """Preprocess a single citation context row according to the options in lda_params."""
    citation_sentence = str(row['context'])
    if lda_params['markers']:
        citation_sentence = preprocessing.remove_markers(citation_sentence)
    if lda_params['tokenize']:
        citation_sentence = preprocessing.tokenize(citation_sentence)
    if lda_params['pos_tags'] != ():
        tags = preprocessing.lower(
            preprocessing.filter_pos_tags(citation_sentence,
                                          tags=lda_params['pos_tags']))
    if lda_params['punctuation']:
        citation_sentence = preprocessing.remove_punctuation(citation_sentence)
    if lda_params['numbers']:
        citation_sentence = preprocessing.remove_numbers(citation_sentence)
    citation_sentence = preprocessing.lower(citation_sentence)
    if lda_params['bigrams']:
        bigrams = preprocessing.get_bigrams(citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.get_trigrams(citation_sentence)
    if lda_params['common_stopwords']:
        citation_sentence = preprocessing.remove_common_stopwords(
            citation_sentence)
    if lda_params['custom_stopwords']:
        citation_sentence = preprocessing.remove_custom_stopwords(
            citation_sentence)
    if lda_params['pos_tags'] != ():
        citation_sentence = preprocessing.filter_pos(citation_sentence, tags)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    if lda_params['bigrams']:
        bigrams = preprocessing.filter_n_grams(bigrams, citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.filter_n_grams(trigrams, citation_sentence)
    if lda_params['bigrams'] and not lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams
    if lda_params['trigrams'] and not lda_params['bigrams']:
        citation_sentence = citation_sentence + trigrams
    if lda_params['bigrams'] and lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams + trigrams
    if lda_params['lemmatize']:
        citation_sentence = preprocessing.lemmatize(citation_sentence)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    return citation_sentence
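A brief usage sketch, assuming lda_params is defined (see the example configuration at the end of this section) and the rows come from a pandas DataFrame with a context column:

import pandas as pd

citations = pd.DataFrame(
    {"context": ["Topic models have been applied to citation contexts [12]."]})
processed = citations.apply(preprocess_doc, axis=1)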
def get_defaults(query=None,
                 number_top_matches: int = 2,
                 ds=data,
                 ds_c_status_embeddings: list = None,
                 ds_lm_embeddings: list = None,
                 ds_v_embeddings: list = None,
                 ds_dcurr_embeddings: list = None,
                 query_embeddings: dict = None,
                 model=model,
                 preprocess_ds: bool = False):
    # Use None instead of mutable default arguments: the query dict is modified
    # in place below, so a shared default would leak state across calls.
    # Missing embeddings are simply recomputed by the `if not ...` checks below.
    if query is None:
        query = {
            "controller_status": "",
            "lumen": "",
            "voltage": "",
            "driver_current": ""
        }

    for key in query:

        if len(query[key]):
            query[key] = pr.to_lower(query[key])
            query[key] = pr.remove_punctuation(query[key])

    if preprocess_ds:
        ds = ds.fillna("")
        ds["controller_status"] = ds["controller_status"].apply(
            lambda x: pr.to_lower(x))
        ds["controller_status"] = ds["controller_status"].apply(
            lambda x: pr.remove_punctuation(x))

        ds["lumen"] = ds["lumen"].apply(lambda x: pr.to_lower(x))
        ds["lumen"] = ds["lumen"].apply(lambda x: pr.remove_punctuation(x))

        ds["voltage"] = ds["voltage"].apply(lambda x: pr.to_lower(x))
        ds["voltage"] = ds["voltage"].apply(lambda x: pr.remove_punctuation(x))

        ds["driver_current"] = ds["driver_current"].apply(
            lambda x: pr.to_lower(x))
        ds["driver_current"] = ds["driver_current"].apply(
            lambda x: pr.remove_punctuation(x))

    if not ds_c_status_embeddings:
        ds_c = ds["controller_status"].tolist()
        ds_c_status_embeddings = [model.encode(c) for c in ds_c]

    if not ds_lm_embeddings:
        ds_lm = ds["lumen"].tolist()
        ds_lm_embeddings = [model.encode(lm) for lm in ds_lm]

    if not ds_v_embeddings:
        ds_v = ds["voltage"].tolist()
        ds_v_embeddings = [model.encode(v) for v in ds_v]

    if not ds_dcurr_embeddings:
        ds_dcurr = ds["driver_current"].tolist()
        ds_dcurr_embeddings = [model.encode(curr) for curr in ds_dcurr]

    scores = []

    query_embeddings = {}
    if len(query["controller_status"]):
        query_embeddings["controller_status"] = model.encode(
            query["controller_status"])
        contr_scores = compute_similarity(
            query_embeddings["controller_status"], ds_c_status_embeddings)
        scores.append(contr_scores)

    if len(query["lumen"]):
        query_embeddings["lumen"] = model.encode(query["lumen"])
        lm_scores = compute_similarity(query_embeddings["lumen"],
                                       ds_lm_embeddings)
        scores.append(lm_scores)

    if len(query["voltage"]):
        query_embeddings["voltage"] = model.encode(query["voltage"])
        v_scores = compute_similarity(query_embeddings["voltage"],
                                      ds_v_embeddings)
        scores.append(v_scores)

    if len(query["driver_current"]):
        query_embeddings["driver_current"] = model.encode(
            query["driver_current"])
        v_scores = compute_similarity(query_embeddings["driver_current"],
                                      ds_dcurr_embeddings)
        scores.append(v_scores)

    mean_score = np.mean(scores, axis=0)

    # Indices of the highest-scoring rows, best match first.
    top_indices = np.argsort(mean_score)[-number_top_matches:][::-1]

    #sorted_scores = sorted(mean_score, reverse=True)
    #top_scores = sorted_scores[:number_top_matches]

    similar_cases = ds.iloc[top_indices]

    result_df = pd.DataFrame()
    result_df["reason"] = similar_cases["reason"]

    return result_df
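compute_similarity() is called above but not defined in this snippet. A plausible stand-in, assuming it returns cosine similarities between the query embedding and each stored embedding:

import numpy as np


def compute_similarity(query_embedding, embeddings):
    # Hypothetical helper: cosine similarity of one query vector against each row.
    query = np.asarray(query_embedding, dtype=float)
    matrix = np.asarray(embeddings, dtype=float)
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    norms = np.where(norms == 0, 1.0, norms)
    return (matrix @ query / norms).tolist()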
def build_model(documents):
    """Preprocess the documents according to lda_params, then train and persist an LDA model."""
    if lda_params['markers']:
        documents = map(preprocessing.remove_markers, documents)
    if lda_params['tokenize']:
        documents = map(preprocessing.tokenize, documents)
    documents = list(documents)
    if lda_params['pos_tags'] != ():
        tags = [
            preprocessing.lower(
                preprocessing.filter_pos_tags(doc,
                                              tags=lda_params['pos_tags']))
            for doc in documents
        ]
    if lda_params['punctuation']:
        documents = [
            preprocessing.remove_punctuation(doc) for doc in documents
        ]
    if lda_params['numbers']:
        documents = [preprocessing.remove_numbers(doc) for doc in documents]
    documents = [preprocessing.lower(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [preprocessing.get_bigrams(doc) for doc in documents]
    if lda_params['trigrams']:
        trigrams = [preprocessing.get_trigrams(doc) for doc in documents]
    if lda_params['common_stopwords']:
        documents = [
            preprocessing.remove_common_stopwords(doc) for doc in documents
        ]
    if lda_params['custom_stopwords']:
        documents = [
            preprocessing.remove_custom_stopwords(doc) for doc in documents
        ]
    if lda_params['pos_tags'] != ():
        documents = [
            preprocessing.filter_pos(documents[i], tags[i])
            for i in range(0, len(documents))
        ]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [
            preprocessing.filter_n_grams(bigrams[i], documents[i])
            for i in range(0, len(documents))
        ]
    if lda_params['trigrams']:
        trigrams = [
            preprocessing.filter_n_grams(trigrams[i], documents[i])
            for i in range(0, len(documents))
        ]
    if lda_params['bigrams'] and not lda_params['trigrams']:
        documents = [
            documents[i] + bigrams[i] for i in range(0, len(documents))
        ]
    if lda_params['trigrams'] and not lda_params['bigrams']:
        documents = [
            documents[i] + trigrams[i] for i in range(0, len(documents))
        ]
    if lda_params['bigrams'] and lda_params['trigrams']:
        documents = [
            documents[i] + bigrams[i] + trigrams[i]
            for i in range(0, len(documents))
        ]
    if lda_params['lemmatize']:
        documents = [preprocessing.lemmatize(doc) for doc in documents]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    documents = [doc for doc in documents if doc]

    dictionary = generate_dictionary(documents)
    corpus = generate_corpus(documents, dictionary)
    lda_model = generate_lda_model(corpus, dictionary,
                                   lda_params['num_topics'])

    if not os.path.exists(lda_params['model_dir']):
        os.makedirs(lda_params['model_dir'])
    dictionary.save(lda_params['model_dir'] + 'lda.dict')
    gensim.corpora.MmCorpus.serialize(lda_params['model_dir'] + 'lda.mm',
                                      corpus)
    lda_model.save(lda_params['model_dir'] + 'lda.model')
    with open(lda_params['model_dir'] + 'lda.docs', 'wb') as docs_file:
        pickle.dump(documents, docs_file, pickle.HIGHEST_PROTOCOL)
    with open(lda_params['model_dir'] + 'lda_params.config',
              'w') as config_file:
        config_file.write(str(lda_params))
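build_model() and preprocess_doc() both read a module-level lda_params dictionary. One plausible shape, inferred from the keys accessed above (all values are illustrative assumptions), together with how the saved artifacts could be reloaded, assuming gensim objects were persisted:

lda_params = {
    'markers': True,
    'tokenize': True,
    'pos_tags': ('NN', 'NNS'),      # an empty tuple () disables POS filtering
    'punctuation': True,
    'numbers': True,
    'bigrams': True,
    'trigrams': False,
    'common_stopwords': True,
    'custom_stopwords': False,
    'lemmatize': True,
    'num_topics': 20,
    'model_dir': 'models/lda/',     # trailing slash expected by the save paths above
}

# Reloading the persisted dictionary, corpus, and model:
import gensim

dictionary = gensim.corpora.Dictionary.load(lda_params['model_dir'] + 'lda.dict')
corpus = gensim.corpora.MmCorpus(lda_params['model_dir'] + 'lda.mm')
lda_model = gensim.models.LdaModel.load(lda_params['model_dir'] + 'lda.model')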