Code Example #1
# Requires TfidfVectorizer from scikit-learn and log10 from the standard library.
# The remaining helpers (lemmatize, extract_candidate_chunks, preprocess,
# tokenize, valid_ngram, get_entropy_doc, get_feature_vector) are defined
# elsewhere in the project.
from math import log10
from sklearn.feature_extraction.text import TfidfVectorizer


def extract_candidates_doc(doc, phrase_list, idf_vec, training_size=450):

    #vocab = set(phrase_list)
    idf_dic = {}
    #print "phrase list len", len(phrase_list)
    #print "len idf_vec", len(idf_vec)
    for i, phrase in enumerate(phrase_list):
        idf_dic[phrase] = idf_vec[i]
    print("--extracting NP")
    # Lemmatize every candidate noun-phrase chunk extracted from the document.
    noun_phrases = set(
        lemmatize(phrase) for phrase in extract_candidate_chunks(doc))

    vectorizer = TfidfVectorizer(decode_error='ignore',
                                 preprocessor=preprocess,
                                 ngram_range=(1, 3),
                                 tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    phrases = list(
        set([
            phrase for phrase in analyzer(doc)
            if valid_ngram(phrase, noun_phrases)
        ]))
    doc = preprocess(doc)

    #print "candidate phrases", phrases
    #tfidf = []
    #first_occurrence = []
    #entropy = []
    #length = []
    doc_len = len(doc)

    entropy = get_entropy_doc(doc, phrases)
    # get feature vectors
    features = []
    for i, phrase in enumerate(phrases):
        # Position of the first occurrence, normalised by document length.
        first_occurrence = doc.find(phrase) / doc_len
        tf = doc.count(phrase)
        if phrase in idf_dic:
            tfidf = tf * idf_dic[phrase]
        else:
            tfidf = tf * log10(training_size)
        feature_vec = get_feature_vector(phrase, tfidf, first_occurrence,
                                         entropy[i])
        features.append(feature_vec)
    return phrases, features
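The phrase weighting above combines term frequency with a learned IDF and falls back to log10(training_size) for phrases that never appeared in the training set. Below is a minimal self-contained sketch of just that fallback; tfidf_with_fallback and the toy idf_dic are illustrative names, not part of the project above.

from math import log10

def tfidf_with_fallback(tf, phrase, idf_dic, training_size=450):
    # Phrases seen during training use their learned IDF weight; unseen
    # phrases fall back to log10(training_size), which (assuming
    # idf = log10(N / df)) is the IDF of a phrase found in one training document.
    if phrase in idf_dic:
        return tf * idf_dic[phrase]
    return tf * log10(training_size)

idf_dic = {'machine learning': 2.3}
print(tfidf_with_fallback(3, 'machine learning', idf_dic))  # 3 * 2.3
print(tfidf_with_fallback(3, 'unseen phrase', idf_dic))     # 3 * log10(450)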
Code Example #2
def extract_candidates_doc(doc, phrase_list, idf_vec, training_size=450):

    #vocab = set(phrase_list)
    idf_dic = {}
    #print "phrase list len", len(phrase_list)
    #print "len idf_vec", len(idf_vec)
    for i, phrase in enumerate(phrase_list):
        idf_dic[phrase] = idf_vec[i]
    print("--extracting NP")
    noun_phrases = set(lemmatize(phrase) for phrase in extract_candidate_chunks(doc))

    vectorizer = TfidfVectorizer(decode_error='ignore', preprocessor=preprocess, ngram_range=(1, 3), tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    phrases = list(set([phrase for phrase in analyzer(doc) if valid_ngram(phrase, noun_phrases)]))
    doc = preprocess(doc)

    #print "candidate phrases", phrases
    #tfidf = []
    #first_occurrence = []
    #entropy = []
    #length = []
    doc_len = len(doc)

    entropy = get_entropy_doc(doc, phrases)
    # get feature vectors
    features = []
    for i, phrase in enumerate(phrases):
        first_occurrence = doc.find(phrase) / doc_len
        tf = doc.count(phrase)
        if phrase in idf_dic:
            tfidf = tf * idf_dic[phrase]
        else:
            tfidf = tf * log10(training_size)
        feature_vec = get_feature_vector(phrase, tfidf, first_occurrence, entropy[i])
        features.append(feature_vec)
    return phrases, features
Code Example #3
from nltk.stem import WordNetLemmatizer


def lemmatize_content(text):
    tmp = []
    lemmatizer = WordNetLemmatizer()
    for word in text.split():
        tmp.append(lemmatize(word, lemmatizer))
    return ' '.join(tmp)
Code Example #4
def lemmatize_content(text):
    tmp = []
    for word in text.split():
        tmp.append(lemmatize(word))
    return ' '.join(tmp)
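For comparison, here is a self-contained sketch built directly on NLTK's WordNetLemmatizer; it assumes the project's lemmatize helper is a thin wrapper around it (the real helper may also handle POS tags).

from nltk.stem import WordNetLemmatizer

def lemmatize_content_nltk(text):
    # Lemmatize each whitespace-separated token with the default (noun) POS
    # and rejoin with single spaces.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

# e.g. lemmatize_content_nltk('two cats on mats') -> 'two cat on mat'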
Code Example #5
import preprocess
import pandas as pd

filename = 'dataset_related.csv'
output = 'data_related_extracted/data_related_extracted_remstop.txt'

df = pd.read_csv(filename)
df_extract = df.loc[:, ['tweet', 'class']]
df_extract = df_extract.dropna()
df_extract = df_extract.drop_duplicates()

tweets = df_extract.tweet.values.tolist()
classes = df_extract['class'].values.tolist()

# Preprocess
for i in range(len(tweets)):
    tweets[i] = tweets[i].replace('\n', ' ')
    tweets[i] = preprocess.preprocess(tweets[i])
    tweets[i] = preprocess.remove_punc(tweets[i])
    tweets[i] = preprocess.lemmatize(tweets[i])
    tweets[i] = preprocess.remove_stopwords(tweets[i])

s = []
for tweet, cl in zip(tweets, classes):
    s.append(tweet + '\t' + str(int(cl)) + '\n')

with open(output, 'wb') as f:
    for x in s:
        f.write(x.encode('utf-8'))
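The output file holds one preprocessed tweet per line with its integer class label after a tab, so it can be read back with a tab separator. A small sketch, assuming the preprocessing removed any tabs and newlines from the tweets; the column names are illustrative.

import pandas as pd

# Read the tab-separated tweet/label pairs back into a DataFrame.
df_back = pd.read_csv('data_related_extracted/data_related_extracted_remstop.txt',
                      sep='\t', header=None, names=['tweet', 'class'],
                      encoding='utf-8')
print(df_back.head())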
Code Example #6
File: utils.py  Project: haluk/NLP_course_materials
def tokenizer(doc):
    doc = preprocess.clean_text(doc)

    return preprocess.lemmatize(doc)
Code Example #7
from scraper import *
import preprocess
from sys import argv
import yaml
import index

# Main script
if __name__ == '__main__':
    if "scrape" in argv:
        with open('sites.yaml', 'rb') as file:
            # yaml.safe_load avoids the explicit Loader argument that newer
            # PyYAML versions require for yaml.load().
            pages = yaml.safe_load(file)

            Scraper(pages).downloadarticlesforeverywebsite()

    if 'lemmatize' in argv:
        preprocess.lemmatize()

    if 'index' in argv:
        index.index()
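The script dispatches on tokens found in argv, so several stages can be chained in one invocation (e.g. `python main.py scrape lemmatize index`, where the script name is illustrative). Below is a minimal standalone sketch of the same pattern with placeholder task functions; the real script wires the tokens to Scraper, preprocess.lemmatize and index.index as above.

import sys

def scrape():
    print('scraping...')       # placeholder task

def lemmatize():
    print('lemmatizing...')    # placeholder task

def index():
    print('indexing...')       # placeholder task

TASKS = {'scrape': scrape, 'lemmatize': lemmatize, 'index': index}

if __name__ == '__main__':
    # Run each known stage, in a fixed order, if its name appears on the command line.
    for name in ('scrape', 'lemmatize', 'index'):
        if name in sys.argv:
            TASKS[name]()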