Example #1
import spacy
from string import punctuation

nlp = spacy.load('en_core_web_sm')


def tokenize(text):
    # Run the spaCy pipeline and keep the raw token strings.
    docs = nlp(text)
    tokens = [token.text for token in docs]

    # Treat the newline character as punctuation and filter it out too.
    punct = list(punctuation)
    punct.append('\n')
    tokens = [token for token in tokens if token not in punct]

    return tokens
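A quick check of the helper above (assuming the en_core_web_sm model is installed):

print(tokenize("Hello, world!\nThis is a test."))
# ['Hello', 'world', 'This', 'is', 'a', 'test']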
Example #2
    def save_article(self):
        from string import punctuation

        # Build a punctuation list that also treats newlines as punctuation.
        punctuation = list(punctuation)
        punctuation.append('\n')

        # Rough whitespace tokenization of the parsed text.
        test = self.ParseText.split(' ')
        print(test)

        # Earlier attempt: drop tokens that are pure punctuation.
        # tokens = [token for token in self.ParseText if token not in punctuation]

        # Earlier attempt: collect the article metadata and append it as a CSV row.
        # row_contents = [f"{self.get_article_title()}", f"{self.get_article_author()}",
        #                 f"{self.get_article_date()}", f"{self.get_article_summary()}",
        #                 f"{self.get_article_tags()}"]
        # with open(self.file_name, 'a+', newline='') as write_obj:
        #     csv_writer = writer(write_obj)
        #     csv_writer.writerow(row_contents)
    def classify_article(self, articles_filename):
        # Load Classifier:
        idf_liwc_article, idf_punc_article, idf_liwc_title, idf_punc_title = load_dataset("train_idf_by_publisher.pkl")
        classifier = load("bypublisher_classification_model.pkl")
        svd = load("bypublisher_svd_model.pkl")
        tfidf = load("bypublisher_tfidf_model.pkl")

        for article in self.efficient_read_article_text(articles_filename):
            # Count LIWC lexicon hits in the article body and title.
            for feature, words in self.liwc_features.items():
                for word in words:
                    counts_articles = article.text.count(word)
                    counts_title = article.title.count(word)
                    article.liwc_counts[feature] += counts_articles
                    article.liwc_counts_title[feature] += counts_title
                    article.all_liwc += counts_articles
                    article.all_liwc_title += counts_title

            # Count punctuation-category hits in the article body and title.
            for feature, words in self.punctuations.items():
                for word in words:
                    counts_articles = article.text.count(word)
                    counts_title = article.title.count(word)
                    article.punctuation_counts[feature] += counts_articles
                    article.punctuation_counts_title[feature] += counts_title
                    article.all_punc += counts_articles
                    article.all_punc_title += counts_title
            # Prepare features
            unigrams = tfidf.transform([" ".join(article.clean_article())])
            liwc = []
            punctuation = []
            structure = [article.count_quotes,
                         article.count_paragraphs,
                         article.count_urls]

            # tf-idf weight per LIWC category: tf from this article, idf from training.
            for feature in self.liwc_features.keys():
                tf_article = 0
                if article.all_liwc != 0:
                    tf_article = article.liwc_counts[feature] / article.all_liwc
                liwc.append(tf_article * idf_liwc_article[feature])

            for feature in self.punctuations.keys():
                tf_article = 0
                if article.all_punc != 0:
                    tf_article = article.punctuation_counts[feature] / article.all_punc
                # Note: a title-side tf (tf_title) was computed here but never used;
                # only the article-body punctuation feature is appended.
                punctuation.append(tf_article * idf_punc_article[feature])

            test_article = hstack([unigrams, [liwc], [punctuation], [structure]])
            test_article = svd.transform(test_article)  # reduce dimensionality with the fitted SVD model
            # Classify Article
            clf_pred = classifier.predict(test_article)[0]
            prediction = ("true" if clf_pred == 1 else "false")
            confidence = 0.0
            # Output prediction
            self.outFile.write(article.id + " " + prediction + " " + str(confidence) + "\n")
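The per-category weights above are a plain tf × idf product. A minimal standalone sketch of that computation (the function name and sample data are illustrative, not from this codebase):

def tfidf_weights(counts, total, idf):
    # counts: hits per category; total: sum of all hits; idf: trained idf per category.
    weights = []
    for feature, idf_value in idf.items():
        tf = counts.get(feature, 0) / total if total else 0.0
        weights.append(tf * idf_value)
    return weights

print(tfidf_weights({"anger": 3, "joy": 1}, 4, {"anger": 1.2, "joy": 0.8}))
# [0.9, 0.2] (up to float rounding)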
Example #4
def POS(pathin, file):
    import os

    os.chdir(pathin)

    # Characters to strip from the raw transcript before parsing.
    removelist = ['\t', '\r']

    with open(file, 'r') as f:
        text = f.read()
        for item in removelist:
            text = text.replace(item, '')
        text = text.split('@')

        #print(text)

    # Find the participants line and reduce each name to its three-letter code.
    people = []
    for line in text:
        if line.startswith('Participants:'):
            for name in line.replace('Participants:', '').split(','):
                name = name.replace(' ', '').replace('\n', '')
                people.append(name[0:3])
            break

    with open(file, 'r') as f:
        text = f.read()
        for item in removelist:
            text = text.replace(item, '')
        text = text.split('\n')

        # Keep speaker lines that are followed by a %mor tier, plus the %mor lines.
        trans = []
        for i, val in enumerate(text):
            label = val[1:4]
            if label in people and i + 1 < len(text) and text[i + 1].startswith('%mor'):
                trans.append(val)
            elif label == 'mor':
                trans.append(val)

    # Pair each speaker line with its %mor tier.
    grouped = [trans[n:n + 2] for n in range(0, len(trans), 2)]

    # Sentence-final punctuation to strip from the %mor tiers.
    p = '.!?'

    turn = []
    speaker = []
    mor = []
    for i, val in enumerate(grouped):
        turn.append(i + 1)

    for i in grouped:
        s = i[0]
        m = i[1]
        # Strip terminal punctuation and the tier label before recording.
        # (The appends must sit outside the character loop; indenting them
        # inside it would record each turn once per punctuation character.)
        for c in p:
            m = m.replace(c, '')
        m = m.strip()
        m = m.replace('%mor:', '')
        m = m.strip()
        speaker.append(s[1:4])
        mor.append(m)

    z = list(zip(turn, speaker, mor))

    # Split each %mor tier into tokens and keep the POS tag before the '|'.
    # (An earlier version also collected the word form after the '|'.)
    turn = []
    speaker = []
    pss = []

    for t, s, m in z:
        p = []
        for token in m.split(' '):
            p.append(token.split('|')[0])
        turn.append(t)
        speaker.append(s)
        pss.append(p)
    pw = list(zip(turn, speaker, pss))

    # Emit one CSV row per POS tag: Turn,Speaker,POS
    lines = []
    for i in pw:
        num = str(i[0])
        speaker = i[1]
        pss = i[2]
        for p in pss:
            lines.append(num + ',' + speaker + ',' + p + '\n')

    # header = 'Turn,Speaker,Word,POS\n'  # variant with the word column
    header = 'Turn,Speaker,POS\n'
    data = ''.join(lines)
    output = header + data
    return output
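A possible way to call the function above and save its CSV output (the directory and the .cha file name are placeholders):

csv_text = POS('/path/to/transcripts', 'session01.cha')
with open('session01_pos.csv', 'w') as out:
    out.write(csv_text)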
Example #6
import spacy
from string import punctuation
from spacy.lang.en.stop_words import STOP_WORDS

# `text` is assumed to hold the document being processed.
stopWords = list(STOP_WORDS)
nlp = spacy.load('en_core_web_sm')

docs = nlp(text)

tokens = [token.text for token in docs]

# Treat the newline character as punctuation before filtering tokens.
punctuation = list(punctuation)
punctuation.append('\n')

tokens = [token for token in tokens if token not in punctuation]

# Count how often each non-stopword, non-punctuation token appears.
word_frequency = {}
for word in docs:
    lowered = word.text.lower()
    if lowered not in stopWords and lowered not in punctuation:
        word_frequency[word.text] = word_frequency.get(word.text, 0) + 1
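Summarization pipelines built on this pattern usually normalize the counts by the most frequent word before scoring sentences; a minimal sketch of that step, assuming the word_frequency dict above is non-empty:

max_frequency = max(word_frequency.values())
normalized = {word: count / max_frequency for word, count in word_frequency.items()}
print(sorted(normalized.items(), key=lambda kv: kv[1], reverse=True)[:10])  # ten most frequent words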
Example #7
    def classify_article(self, articles_filename):
        # Load Classifier:
        idf_liwc_article, idf_punc_article, idf_liwc_title, idf_punc_title = load_dataset("train_idf_by_articles.pkl")
        classifier = load("byarticle_classification_model.pkl")
        tfidf = load("byarticle_article_tfidf_model.pkl")
        tfidf_title = load("byarticle_title_tfidf_model.pkl")
        for article in self.efficient_read_article_text(articles_filename):
            article_emotions = np.sum(list(map(self.emotions_word, article.text.split(" "))), axis=0)
            # Count LIWC lexicon hits in the article body and title.
            for feature, words in self.liwc_features.items():
                for word in words:
                    counts_articles = article.text.count(word)
                    counts_title = article.title.count(word)
                    article.liwc_counts[feature] += counts_articles
                    article.liwc_counts_title[feature] += counts_title
                    article.all_liwc += counts_articles
                    article.all_liwc_title += counts_title

            # Count punctuation-category hits in the article body and title.
            for feature, words in self.punctuations.items():
                for word in words:
                    counts_articles = article.text.count(word)
                    counts_title = article.title.count(word)
                    article.punctuation_counts[feature] += counts_articles
                    article.punctuation_counts_title[feature] += counts_title
                    article.all_punc += counts_articles
                    article.all_punc_title += counts_title
            # Prepare features
            unigrams = tfidf.transform([" ".join(article.clean_article())])
            title_unigrams = tfidf_title.transform([article.title])
            liwc = []
            punctuation = []
            structure = [article.count_quotes,
                         article.count_paragraphs,
                         article.count_urls]

            for feature in self.liwc_features.keys():
                tf_article = 0
                if article.all_liwc != 0:
                    tf_article = article.liwc_counts[feature] / article.all_liwc
                liwc.append(tf_article * idf_liwc_article[feature])

            for feature in self.punctuations.keys():
                tf_article = 0
                if article.all_punc != 0:
                    tf_article = article.punctuation_counts[feature] / article.all_punc
                # Note: a title-side tf (tf_title) was computed here but never used;
                # only the article-body punctuation feature is appended.
                punctuation.append(tf_article * idf_punc_article[feature])

            test_article = hstack([unigrams, title_unigrams, liwc, punctuation, structure, article_emotions])
            # Classify Article
            clf_pred = classifier.predict(test_article)[0]
            prediction = ("true" if clf_pred == 1 else "false")
            # confidence = max(classifier.predict_proba(test_article)[0])
            confidence = 0.0
            # Output prediction
            print(article.id + " " + prediction + " " + str(confidence))
            self.outFile.write(article.id + " " + prediction + " " + str(confidence) + "\n")
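Compared with the by-publisher variant in Example #2, this model adds title unigrams and a summed per-word emotion vector to the feature stack and skips the SVD reduction step.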
Example #8
    def get_punctuation(self):
        # Return string.punctuation plus the newline character as a list.
        from string import punctuation
        punctuation = list(punctuation)
        punctuation.append('\n')

        return punctuation
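The helper never touches self, so its result is easy to sanity-check standalone:

from string import punctuation
expected = list(punctuation) + ['\n']
print(len(expected))  # 33: the 32 ASCII punctuation characters plus '\n'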