def textExtraction(df, series):
    vectorizer = CountVectorizer(analyzer = text_process, min_df = 0.1)
    df[series] = df[series].replace(np.nan, '', regex=True)
    vectorizer.fit_transform(df[series])
    vocab = vectorizer.get_feature_names()
    
    return vocab
Example #2
    def tokenize(self, analyzer='word', ngram_range=(1, 1)):
        text = self.tweet['text']
        if text:
            vec = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range,
                                  lowercase=False)
            vec.fit_transform([text])
            self.tw_features = self.tw_features.union(set(vec.get_feature_names()))
Example #3
	def score(self, curr_example):
		# pdb.set_trace()	
		processed_example = self.add_start_end_tokens(curr_example)
		example_length = len(processed_example.split())

		trigram_vectorizer = CountVectorizer(ngram_range=(3,3),\
												min_df=1,\
												max_df=1.0,\
												lowercase=True,
												analyzer="word",
												token_pattern=self.VECTORIZER_TOKEN_PATTERN)
		trigram_vectorizer.fit_transform([processed_example])
		trigram_count_matrix = trigram_vectorizer.transform([processed_example])

		score = 0
		for gram, count in zip(trigram_vectorizer.get_feature_names(), np.asarray(trigram_count_matrix.sum(axis=0)).ravel()):
			score += count * math.log(self.ngrams_dict[gram] + 1)
			leading_bigram = self.get_leading_bigram(gram)
			score -= count * math.log(self.ngrams_dict[leading_bigram] + self.training_vocab_size)

		## Calculate perplexity for scoring
		exponent = -float(1) / example_length
		pp_score = math.pow(math.exp(score), exponent) if math.exp(score) > 0.0 else float('+inf')

		return pp_score
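# A standalone sketch (an assumed helper, not part of the class above) of the smoothed
# trigram probability and perplexity that score() computes:
#   P(w3 | w1 w2) ~ (count(w1 w2 w3) + 1) / (count(w1 w2) + V)
#   perplexity    = exp(sum of log-probabilities) ** (-1 / num_tokens) = exp(-score / num_tokens)
import math

def trigram_perplexity(trigram_counts, ngrams_dict, vocab_size, num_tokens):
    """trigram_counts: {trigram: count in the example}; ngrams_dict: training counts."""
    log_prob = 0.0
    for gram, count in trigram_counts.items():
        leading_bigram = " ".join(gram.split()[:2])
        log_prob += count * math.log(ngrams_dict.get(gram, 0) + 1)
        log_prob -= count * math.log(ngrams_dict.get(leading_bigram, 0) + vocab_size)
    return math.exp(-log_prob / num_tokens)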
Example #4
def get_features_by_wordbag():
    global max_features
    x_train, x_test, y_train, y_test=load_all_files()

    vectorizer = CountVectorizer(
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1 )
    print vectorizer
    x_train=vectorizer.fit_transform(x_train)
    x_train=x_train.toarray()
    vocabulary=vectorizer.vocabulary_

    vectorizer = CountVectorizer(
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 vocabulary=vocabulary,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1 )
    print vectorizer
    x_test=vectorizer.fit_transform(x_test)
    x_test=x_test.toarray()

    return x_train, x_test, y_train, y_test
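# A minimal sketch (toy corpus assumed) of the same train/test alignment without
# re-instantiating the vectorizer: fit once on the training texts, then call
# transform() on the test texts so both matrices share the same columns.
from sklearn.feature_extraction.text import CountVectorizer

train_docs = ["spam spam ham", "ham eggs"]
test_docs = ["spam eggs unknownword"]

vec = CountVectorizer()
x_train = vec.fit_transform(train_docs)  # learns the vocabulary
x_test = vec.transform(test_docs)        # reuses it; unseen words are simply dropped
assert x_train.shape[1] == x_test.shape[1]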
Example #5
def TFIDF():
    global segcont
    global weight
    global we
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(segcont))
    word = vectorizer.get_feature_names()  # keywords across all documents
    weight = tfidf.toarray()  # the corresponding tf-idf matrix
    del segcont

    seg = []
    for i in range(len(weight)):
        enstr = ""
        for j in range(len(word)):
            if weight[i][j] >= 0.1:  # keep only terms whose tf-idf weight reaches 0.1
                enstr = enstr + " " + word[j]
        seg.append(enstr)

    del weight
    vec = CountVectorizer()
    tra = TfidfTransformer()
    tidf = tra.fit_transform(vec.fit_transform(seg))
    wo = vec.get_feature_names()
    we = tidf.toarray()

Example #6
def wordexist():
	corpus = [
        'UNC played Duke in basketball',
        'Duke lost the basketball game' ]
	vectorizer = CountVectorizer()
	print vectorizer.fit_transform(corpus).todense()
	print vectorizer.vocabulary_  # vocabulary table
Example #7
def createCorpus(data,i, binaryX="False", stopWords=None, lemmatize="False", tfidf= "False", useidf="True"):  # vectorizes the bag of words using term frequency and returns the required arrays
    X_train =[]
    X_test=[]
    Y_train=[]
    Y_test=[]

    for key in data:
        if key in i:

            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port =  WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k,"v") for k in text.split()])
                X_test.append(text)
                Y_test.append(data[key][filename][1])
        else:
            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port =  WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k,"v") for k in text.split()])
                X_train.append(text)
                Y_train.append(data[key][filename][1])
    if tfidf == "False":
        vectorizer = CountVectorizer(min_df=1, binary=(binaryX == "True"), stop_words=stopWords)  # binaryX arrives as a string, so convert it to a real bool
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)
        return X_train_ans, Y_train, X_test_ans,Y_test
    elif tfidf == "True":
        vectorizer = TfidfVectorizer(min_df=1, use_idf=(useidf == "True"))
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)

        return X_train_ans, Y_train, X_test_ans,Y_test
def filter_corpora(corpora, num_top_wrds, skip=0):
    """Filter each inputted corpus into only `num_top_words` after `skip` words.

    Args: 
    ----
        corpora: list of tuples
            (name (str), corpus (1d np.ndarray of strings) pairs)
        num_top_words :int
        skip (optional): int
            allows for looking at the second, third, fourth `num_top_wrds`

    Return:
    ------
        filtered_corpora: list of tuples
    """

    num_top_wrds += skip 
    vectorizer = CountVectorizer(max_features=num_top_wrds, stop_words='english')

    filtered_corpora = []
    for name, corpus in corpora:
        vectorizer.fit_transform(corpus)
        most_common_wrds = vectorizer.get_feature_names()[skip:]
        filtered_corpora.append((name, most_common_wrds))

    return filtered_corpora
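# Hypothetical usage of filter_corpora with toy corpora (the (name, documents)
# tuple shape follows the docstring above; the data is made up for illustration):
import numpy as np

toy_corpora = [("news", np.array(["markets fell on rate fears",
                                  "rates rose as markets rallied"])),
               ("sports", np.array(["the team won the cup",
                                    "cup final won in extra time"]))]
print(filter_corpora(toy_corpora, num_top_wrds=3, skip=1))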
class RandomForestRegressor(Regressor):
   def findImportantFeatures(self, numFeatures = 500):
      self.features = []
      count = 0
      if self.isGroup:
         for key in sorted(self.trainSet.getVocabulary(), key = lambda word: self.trainSet.getMI(word, self.trainSet), reverse=True):
            self.features.append(key)
            count += 1
            if count == numFeatures:
               self.minMI = self.trainSet.getMI(key, self.trainSet)
               break
      else:
         for key in sorted(self.trainSet.getVocabulary(), key = lambda word: math.fabs(self.trainSet.getUniqueWeightOf(word)), reverse=True):
            self.features.append(key)
            count += 1
            if count == numFeatures:
               self.minMI = self.trainSet.getUniqueWeightOf(key)
               break
   def train(self, numFeatures = 500):
      self.findImportantFeatures(numFeatures)
      self.regressor = RFR()
      self.vectorizer = CountVectorizer(vocabulary = self.features, min_df = 1)
      strings = []
      Y = []
      for docKey in self.trainSet.getDocuments():
         document = self.trainSet.getDocument(docKey)
         strings.append(" ".join(document.getBagOfWords2("all")))
         Y.append(document.getSalary())
      X = self.vectorizer.fit_transform(strings).toarray()
      self.regressor.fit(X, Y)
   def predict(self, document):
      strings = []
      strings.append(" ".join(document.getBagOfWords2("all")))
      Z = self.vectorizer.transform(strings).toarray()  # reuse the fitted vocabulary; no need to refit on a single document
      return self.regressor.predict(Z)[0]
class SVM(Classifier):
   def findImportantFeatures(self, numFeatures = 500):
      self.features = []
      count = 0
      for key in sorted(self.trainingSet.getVocabulary(), key = lambda word: self.trainingSet.getUniqueWeightOf(word), reverse=True):
         self.features.append(key)
         count += 1
         if count == numFeatures:
            break

   def train(self, numFeatures = 500):
      self.findImportantFeatures(numFeatures)
      self.classifier = svm.LinearSVC(C = 5.0, dual = True, verbose = 0)
      self.vectorizer = CountVectorizer(vocabulary = self.features, min_df = 1)
      strings = []
      Y = []
      for docKey in self.trainingSet.getDocuments():
         document = self.trainingSet.getDocument(docKey)
         strings.append(" ".join(document.getBagOfWords2("all")))
         Y.append(document.getGroup().getKey())
      X = self.vectorizer.fit_transform(strings)
      self.classifier.fit(X, Y)
   def classify(self, document):
      strings = []
      strings.append(" ".join(document.getBagOfWords2("all")))
      Z = self.vectorizer.transform(strings)
      return self.classifier.predict(Z)[0]
   def classifyAll(self, testSet):
      strings = []
      for docKey in testSet.getDocuments():
         document = testSet.getDocument(docKey)
         strings.append(" ".join(document.getBagOfWords2("all")))
      Z = self.vectorizer.transform(strings)
      return self.classifier.predict(Z)
def cal_product_description_tfidf():
    #PART II compute the tf-idf for product description
    global AllSet  # AllSet is re-assigned below, so declare it global to avoid an UnboundLocalError
    print "\nBegin computing the tf-idf for product description ..."
    product_description_data = pd.read_csv('product_descriptions.csv')

    print "\nMerge the product description into database..."
    AllSet = pd.merge( AllSet , product_description_data, how='left', on='product_uid')

    print "\nStemming the product description ..."
    AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x))
    product_description=AllSet['product_description']

    print "\nGet the (product description vocabulary)-(search term) frequency matrix..."
    search_vect_descrip = CountVectorizer(stop_words='english', binary=True)  # binary indicators instead of raw counts
    search_vect_descrip.fit(product_description)#learn the vocabulary
    search_descrip_fq_matrix = search_vect_descrip.transform(search_term) #get the (product description vocabulary)-(search term) frequency matrix

    print "\nGet the (product description vocabulary)-(product_description) frequency matrix..."
    description_vect = CountVectorizer(stop_words ='english')
    description_vect.fit_transform(product_description)#learn the vocabulary
    description_fq_matrix=description_vect.transform(product_description) #get the (product description vocabulary)-(product_description) frequency matrix

    print "\nGet the idf matrix..."
    tfidf_transformer = TfidfTransformer(norm="l2",smooth_idf=True)
    tfidf_transformer.fit(description_fq_matrix) # get idf for each vocabulary
    tf_idf_descrip_matrix  = tfidf_transformer.transform(description_fq_matrix) #get the idf matrix


    print "\nCompute the result of tf-idf for product description ..."
    tf_idf_descrip_result=[]#compute the result of tf-idf for product title
    for index in range(tf_idf_descrip_matrix.shape[0]):
        tf_idf_descrip_result.append((np.multiply(tf_idf_descrip_matrix[index], search_descrip_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id":AllSet['id'],"product_description_tfidf": tf_idf_descrip_result}).to_csv('product_description_tfidf.csv', index=False)
Example #12
def main():
    twenty = fetch_20newsgroups()
    tfidf = TfidfVectorizer().fit_transform(twenty.data)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print related_docs_indices
    print cosine_similarities[related_docs_indices]
    # vectorizer = CountVectorizer(min_df=1)
    # corpus = [
    # 'This is the first document.',
    # 'This is the second second document.',
    # 'And the third one.',
    # 'Is this the first document?',
    # ]

    # tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    # tfs = tfidf.fit_transform(token_dict.values())

    train_set = ("The sky is blue.", "The sun is bright.")
    test_set = ("The sun in the sky is bright.",
                "We can see the shining sun, the bright sun.")
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit_transform(train_set)
    print "Vocabulary:", count_vectorizer.vocabulary
    # Vocabulary: {'blue': 0, 'sun': 1, 'bright': 2, 'sky': 3}
    freq_term_matrix = count_vectorizer.transform(test_set)
    print freq_term_matrix.todense()
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    print "IDF:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print tf_idf_matrix.todense()
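    # For comparison, a TfidfVectorizer sketch that collapses the two steps above
    # (CountVectorizer + TfidfTransformer) into a single object; same toy sets.
    tfidf_vect = TfidfVectorizer(norm="l2")
    tfidf_vect.fit(train_set)
    print tfidf_vect.transform(test_set).todense()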
Example #13
class NERKNNClassifier(NERClassifier):
    def __init__(self, recbysns):
        NERClassifier.__init__(self, recbysns)
        self.knn = None
        self.vectorizer = None

    def train(self, entities):
        self.knn = KNeighborsClassifier(n_neighbors=10)
        self.vectorizer = CountVectorizer(
            analyzer=NEREntityAnalyzer(self.recbysns), max_df=1.0, min_df=2)
        self.vectorizer.fit_transform(entities)
        X = [self.generate_features(entity)
             for entity in entities if entity.pos() == u'title']
        Y = [entity.ner_class()
             for entity in entities if entity.pos() == u'title']
        self.knn = self.knn.fit(X, Y)

    def predict(self, entity):
        if entity.pos() == u'url':
            return NER_VIDEO
        else:
            X = [self.generate_features(entity)]
            return self.knn.predict(X)[0]

    def generate_features(self, entity):
        text_features = self.vectorizer.transform([entity]).\
                        toarray()[0].tolist()
        features = entity.features().values()
        return text_features + features
class SVMRegressor(Regressor):
   def findImportantFeatures(self, numFeatures = 1000):
      #Selecting the important features
      self.features = []
      count = 0
      for key in sorted(self.trainSet.getVocabulary(), key = lambda word: self.trainSet.getUniqueWeightOf(word), reverse=True):
         count += 1
         self.features.append(key)
         if count == numFeatures:
            break
   def train(self, numFeatures = 1000):
      self.findImportantFeatures(numFeatures)
      self.vectorizer = CountVectorizer(vocabulary = self.features,min_df = 1)
      self.regressor = SVR(kernel='linear', C=25, epsilon=10)
      strings = []
      Y = []
      for docKey in self.trainSet.getDocuments():
         document = self.trainSet.getDocument(docKey)
         strings.append(" ".join(document.getBagOfWords2("all")))
         Y.append(document.getSalary())
      X = self.vectorizer.fit_transform(strings)
      self.regressor.fit(X,Y)
      Coef = self.regressor.coef_
      coef_list = Coef.toarray()
      #for i in range(len(coef_list[0])):
      #   if math.fabs(coef_list[0][i]-0.0) > 0.1:
      #      print self.features[i],coef_list[0][i]


   def predict(self, document):
      strings = []
      strings.append(" ".join(document.getBagOfWords2("all")))
      Z = self.vectorizer.transform(strings)
      return self.regressor.predict(Z)[0]
Example #15
def tfidf_step_by_step():
    """ Example of calculating TF-IDF for OSM nodes.
    Document is a list of keys.
    """

    learn_data_set = documents_gen()
    test_data_set = documents_gen()

    # calculate term-frequency
    vectorizer = CountVectorizer(stop_words=stop_words,
        token_pattern='[a-z0-9_\-:]+')
    vectorizer.fit_transform(learn_data_set)
    #pprint.pprint(vectorizer.vocabulary_)

    # freq_term_matrix is a sparse matrix (elemens stored in Coordinate format
    # http://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_.28COO.29 )
    freq_term_matrix = vectorizer.transform(test_data_set)
    # freq_term_matrix.todense()

    # l2 - Euclidean normalization
    # http://en.wikipedia.org/wiki/Norm_%28mathematics%29#Euclidean_norm
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)

    tf_idf = tfidf.transform(freq_term_matrix)

    pprint.pprint(tf_idf.todense())
    def action(self, tweets_list):
        corpus = []
        for tweet in tweets_list:
            #corpus += [t["text"]]
            tweet_str = tweet["text"].encode("utf-8")
            tweet_str = unicode(tweet_str,'utf-8')
            corpus.append(tweet_str)

        print(corpus)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)
        M,P=X.shape


        dist_corpus=euclidean_distances(X)

        stwf=stopwords.words('french')
        stwf.append('les')
        vectorizer=CountVectorizer(stop_words=stwf)
        X = vectorizer.fit_transform(corpus)
        dico=vectorizer.vocabulary_
        
        #All the prints are grouped here
        
        print("Results of Birch algorithm")

        clusters = birch_algo(X.toarray(), None)
        quit()
def cal_product_title_tfidf():

    #PART I compute the tf-idf for product title
    print "\nBegins,compute the tf-idf for product title ..."


    print "\nStemming product_title..."
    AllSet['product_title'] = AllSet['product_title'].map(lambda x : stem_process(x))
    product_title = AllSet['product_title']

    print "\nGet the (product title vocabulary)-(search term) frequency matrix..."
    search_vect_tittle = CountVectorizer(stop_words='english', binary=True)  # binary indicators instead of raw counts
    search_vect_tittle.fit(product_title)#learn the vocabulary
    search_tittle_fq_matrix = search_vect_tittle.transform(search_term) #get the (product title vocabulary)-(search term) frequency matrix

    print "\nGet the (product title vocabulary)-(product_title) frequency matrix"
    title_vect = CountVectorizer(stop_words='english')
    title_vect.fit_transform(product_title)#learn the vocabulary
    title_fq_matrix = title_vect.transform(product_title) #get the (product title vocabulary)-(product_title) frequency matrix

    print "\nGet the idf matrix"
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(title_fq_matrix) # get idf for each vocabulary
    tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix) #get the idf matrix

    print "\nCompute the result of tf-idf for product title ..."
    tf_idf_title_result = [] #compute the result of tf-idf for product title
    for index in range(tf_idf_title_matrix.shape[0]):
        tf_idf_title_result.append((np.multiply(tf_idf_title_matrix[index], search_tittle_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'],"product_title_tfidf": tf_idf_title_result}).to_csv('product_title_tfidf.csv', index=False)

    return 0
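# A small self-contained sketch (toy data, illustrative names) of the per-row score
# computed above: the dot product between a document's tf-idf row and the binary
# indicator row of its search term over the same vocabulary.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

titles = ["angle bracket steel", "wood screw pack"]
queries = ["steel bracket", "wood screw"]

title_vect = CountVectorizer(stop_words='english')
title_counts = title_vect.fit_transform(titles)
query_flags = CountVectorizer(stop_words='english', binary=True,
                              vocabulary=title_vect.vocabulary_).transform(queries)

title_tfidf = TfidfTransformer(norm="l2", smooth_idf=True).fit_transform(title_counts)
scores = [title_tfidf[i].multiply(query_flags[i]).sum()
          for i in range(title_tfidf.shape[0])]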
def improveVocabulary(positiveDocuments, negativeDocuments):
    countVectPos = CountVectorizer(min_df = 0.1, stop_words = 'english')
    countVectNeg = CountVectorizer(min_df = 0.1, stop_words = 'english')
    positiveCandidates = []
    negativeCandidates = []
    if len(positiveDocuments) > 0:
        try:
            countVectPos.fit_transform(positiveDocuments)
            positiveCandidates = countVectPos.get_feature_names()
        except:
            pass  # CountVectorizer failed (e.g. empty vocabulary after pruning)
    if len(negativeDocuments) > 0:
        try:
            countVectNeg.fit_transform(negativeDocuments)
            negativeCandidates = countVectNeg.get_feature_names()
        except:
            pass  # CountVectorizer failed (e.g. empty vocabulary after pruning)
    global listPos, listNeg, countDictPos, countDictNeg
    #pdb.set_trace()
    for candidate in (positiveCandidates + negativeCandidates):
        score = (getMapOutput(countVectPos.vocabulary_, candidate) - getMapOutput(countVectNeg.vocabulary_, candidate))
        if (score > 0 and  score/getMapOutput(countVectPos.vocabulary_, candidate) >= 0.1):
            insertMap(listPos, candidate)
        elif (score < 0 and  abs(score)/getMapOutput(countVectNeg.vocabulary_, candidate) >= 0.1):
            insertMap(listNeg, candidate)
Example #19
def make_week1_plot(df):


    vectorizer = CountVectorizer(stop_words='english',
                                 ngram_range=(1, 1),
                                 token_pattern='[A-Za-z]+')
    features = vectorizer.fit_transform(df.ingredient_txt)
    ## features is a document x term matrix.

    wc = feature_counts(vectorizer, features)

    ## plot of most common words:
    p1 = wc.sort('count').tail(20).plot('word','count', kind='bar')

    v2 = CountVectorizer(stop_words=get_stop_words(),
                         ngram_range=(1, 1),
                         token_pattern='[A-Za-z]+')
    f2 = v2.fit_transform(df.ingredient_txt)
    ## features is a document x term matrix.

    wc2 = feature_counts(v2, f2)

    ## plot of most common words:
    n = 50

    plt.figure(1)
    plt.subplot(211)
    p1 = wc.sort('count').tail(n).plot('word','count', kind='bar')

    plt.subplot(212)
    p2 = wc2.sort('count').tail(n).plot('word','count', kind='bar')

    plt.tight_layout()
    plt.savefig('fig-word-count-histograms.png')
Example #20
def prepare_data(test_train_split, train_features_file, test_features_file):
    vectorizer = CountVectorizer(analyzer = "word",tokenizer = None,preprocessor = None,stop_words = None,max_features = 1000) 
    song_id = 0
    clean_train_songs=[]
    clean_test_songs=[]
    clean_train_labels=[]
    clean_test_labels=[]
    for genre in os.listdir(dirname):
        print("\nProcessing Genre: " + genre)
        songs = os.listdir(os.path.join(dirname, genre))
        num_songs = len(songs)
        song_index = 0 # Index of song within this genre
        for song in songs:
            print(song)
            with open(os.path.join(dirname, genre, song), 'r') as song_lyrics:
                lyrics = song_lyrics.read()
                words = process_song(lyrics)
                print 'NumSongs: %d SongIndex %d SongId %d' % (num_songs, song_index, song_id)
                if (song_index + 1) <= test_train_split * num_songs:
                    clean_train_songs.append(words)
                    clean_train_labels.append(genre)
                else:
                    clean_test_songs.append(words)
                    clean_test_labels.append(genre)

            song_index = song_index + 1
            song_id = song_id + 1
    # fit_transform() does two things: first, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of
    # strings.
    train_data_features = vectorizer.fit_transform(clean_train_songs)
    # use transform (not fit_transform) on the test set so it reuses the training vocabulary
    test_data_features = vectorizer.transform(clean_test_songs)

    # Numpy arrays are easy to work with, so convert the result to an 
    # array
    train_data_features = train_data_features.toarray()
    test_data_features = test_data_features.toarray()
    print train_data_features.shape
    print test_data_features.shape

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators = 100) 

    # Fit the forest to the training set, using the bag of words as 
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    forest = forest.fit( train_data_features, clean_train_labels )
    
    # Use the random forest to make sentiment label predictions
    result = forest.predict(test_data_features)
    iteration=0
    acc=0
    for x in result:
        if (x == clean_test_labels[iteration]):
            acc = acc + 1
        iteration = iteration + 1
    print 'Accuracy=%f' % (acc*1.0/len(result))
def makeFeatures():

    connection = happybase.Connection(MACHINE + '.vampire', table_prefix=VUID)
    table = connection.table(DCOG_TABLE)
    f_table = connection.table(DCOG_F_TABLE)
    genre_data = []
    style_data = []
    
    keys = []

    for key,d in table.scan():
        data = json.loads(d.itervalues().next())
        genres = data['genres']
        styles = data['styles']
        
        if (genres):
            genre_data.append(genres)
        else:
            genre_data.append(' ')
            
        if (styles):
            style_data.append(styles)
        else:
            style_data.append(' ')

        keys.append(key)

    # Vectorize genre word counts
    g_vectorizer = CountVectorizer(analyzer = "word",   \
                                   tokenizer = None,    \
                                   preprocessor = None, \
                                   stop_words = None) 
    
    g_features = g_vectorizer.fit_transform(genre_data)
    g_features = g_features.toarray()

    # Vectorize style word counts
    s_vectorizer = CountVectorizer(analyzer = "word",   \
                                   tokenizer = None,    \
                                   preprocessor = None, \
                                   stop_words = None)
    
    s_features = s_vectorizer.fit_transform(style_data)
    s_features = s_features.toarray()

    # Create Key Vector 
    k_arr = np.array(keys)
    k_arr.shape = (-1, 1)

    features = np.concatenate((k_arr, g_features, s_features), axis=1)


    b = f_table.batch()
    for row in features:
        data = row[1:]
        data = list(data.astype(int))
        b.put(row[0], {DCOG_F_COLUMN_FAMILY + ':' + DCOG_F_COLUMN : json.dumps(data)})
   
    b.send()
Example #22
def train_test(args):
    
    # unpack arguments and make train/test data/label dicts/lists
    train, test, features, classifier = args

    # create tf idf spare matrix from training data
    if features == 'tfidf':
        fe = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_features=1290)
        trainfe = fe.fit_transform(train['data'])
    elif features == 'dict':
        fe = CountVectorizer(tokenizer=tokenize, stop_words='english', binary=True)
        trainfe = fe.fit_transform(train['data'])
    elif features == 'lsa':
        svd = TruncatedSVD(n_components=100, random_state=42)
        fe = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_df=0.115, max_features=11500)
        trainfe = svd.fit_transform(fe.fit_transform(train['data']))
    elif features == 'rule':
        hamfe = CountVectorizer(tokenizer=tokenize, stop_words='english', max_features=1150)
        spamfe = CountVectorizer(tokenizer=tokenize, stop_words='english', max_features=1150)
        hamfit = hamfe.fit_transform(train['data'].loc[train['labels'] == 0])
        spamfit = spamfe.fit_transform(train['data'].loc[train['labels'] == 1])

    # train multinomial nb classifier on training data
    if classifier == 'mnb':
        from sklearn.naive_bayes import MultinomialNB
        clf = MultinomialNB().fit(trainfe, train['labels'])
    elif classifier == 'gnb':
        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB().fit(trainfe.toarray(), train['labels'])
    elif classifier == 'svm':
        from sklearn.linear_model import SGDClassifier
        clf = SGDClassifier(loss='squared_hinge', penalty='l2').fit(trainfe, train['labels'])
    elif classifier == 'log':
        from sklearn.linear_model import SGDClassifier
        clf = SGDClassifier(loss='log', penalty='l2').fit(trainfe, train['labels'])
    elif classifier == 'rule':
        hamfeats = hamfe.transform(test['data'])
        spamfeats = spamfe.transform(test['data'])
        hyp = np.array(hamfeats.sum(axis=1) < spamfeats.sum(axis=1)).reshape(-1).T
        
    # extract features from test data
    if features == 'lsa':
        feats = svd.transform(fe.transform(test['data']))
    else:
        feats = fe.transform(test['data'])
    # use trained classifier to generate class predictions from test features
    if classifier == 'gnb':
        hyp = clf.predict(feats.toarray())
    elif classifier == 'rule':
        pass
    else:
        hyp = clf.predict(feats)

    # compare predictions with test labels
    score = np.mean(hyp == test['labels'])

    return score
    def test_transformation(self):

        #TODO: Remove this function. Useless now

        train_set = ("The sky is blue.", "The sun is bright.")
        #test_set = ("The sun in the sky is bright.",
                    #"We can see the shining sun, the bright sun.")
        count_vectorizer = CountVectorizer(stop_words='english')
        count_vectorizer.fit_transform(train_set)
Example #24
def get_features(text):
    count_vect = CountVectorizer(analyzer='char_wb', ngram_range=(1, 5), min_df=1,
                                 vocabulary=unique)
    if isinstance(text, list):
        x_train_counts = count_vect.fit_transform([abstract.lower() for abstract in text])
    else:
        x_train_counts = count_vect.fit_transform([text.lower()])
    tfidf_transformer = TfidfTransformer()
    return tfidf_transformer.fit_transform(x_train_counts)
def vocabulary(text):
    count = CountVectorizer(analyzer='word',ngram_range=(1,1),stop_words='english')
    countTotal = CountVectorizer(analyzer='word',ngram_range=(1,1))
    counter = count.fit_transform([text]).toarray()
    countT = countTotal.fit_transform([text]).toarray()
    matrix = np.zeros((1, 1))
    matrix[0, 0] = (countT.sum()-counter.sum())/float(countT.sum())

    return matrix
Example #26
def setup(train, test, binaryOpt = False):
    count_vectorizer = CountVectorizer(binary = binaryOpt)
    count_vectorizer.fit_transform(train)
    freq_term_matrix = count_vectorizer.transform(test)
    if binaryOpt:
        return freq_term_matrix
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    return tf_idf_matrix
    def bagofword(self):
        count_vectorizer = CountVectorizer()
        if self.all_wordlist:
            feature_vectors = count_vectorizer.fit_transform(self.all_wordlist)
            print(feature_vectors.toarray())
            self.all_bagofwords = feature_vectors
        if self.wordlist:
            feature_vectors = count_vectorizer.fit_transform(self.wordlist)
            print(feature_vectors.toarray())
            self.bagofwords = feature_vectors
def to_features(tweet):
    stop_words = ['iphone', 'ipod', 'ipad', 'mac', 'imac', 'rt', 'apple', 'amp']
    stop_words = ENGLISH_STOP_WORDS.union(stop_words)
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 2), stop_words=stop_words)
    tweet = rm_usernames(rm_links(tweet))
    try:
        vectorizer.fit_transform([tweet])
        return vectorizer.get_feature_names()
    except ValueError:
        return ['']
Example #29
def tfidf_normalize(articles_with_id):
    global NON_STOPWORD_LIMIT
    stemmed_articles_with_id = [(aid, stem_article(article)) for (aid, article) in articles_with_id]
    stemmed_articles = [article for (aid, article) in stemmed_articles_with_id]
    # test_set = train_set
    # instantiate the vectorizer with English stop words and the min_df, max_df and token_pattern parameters
    vectorizer = CountVectorizer(stop_words="english", min_df=3, max_df=0.1, token_pattern=r"\b[a-zA-Z][a-zA-Z]+\b")
    # by applying the vectorizer instance to the train set
    # it will create a vocabulary from all the words that appear in at least min_df and in no more than max_df
    # documents in the train_set
    vectorizer.fit_transform(stemmed_articles)
    # vectorizer transform will apply the vocabulary from the train set to the test set. In my case,
    # they are the same set: whole Wikipedia.
    # this means that each article will get representation based on the words from the vocabulary and
    # their TF-IDF values in the Scipy sparse output matricx
    freq_term_matrix = vectorizer.transform(stemmed_articles)
    long_articles_with_id = []
    assert freq_term_matrix.shape[0] == len(articles_with_id)
    for (i, article_with_id) in zip(xrange(freq_term_matrix.shape[0]), stemmed_articles_with_id):
        row = freq_term_matrix.getrow(i)
        if row.getnnz() >= NON_STOPWORD_LIMIT:
            long_articles_with_id.append(article_with_id)

    long_articles = [article for (aid, article) in long_articles_with_id]

    vectorizer = CountVectorizer(stop_words="english", min_df=3, max_df=0.1, token_pattern=r"\b[a-zA-Z][a-zA-Z]+\b")
    vectorizer.fit_transform(long_articles)

    freq_term_matrix = vectorizer.transform(long_articles)

    # Gabrilovich says that they threshold TF on 3 (remove word-article association if that word
    # does not appear at least 3 times in that single article
    # freq_term_matrix.data *= freq_term_matrix.data>=3
    # freq_term_matrix.eliminate_zeros() # I think this is not necessary...
    # this is a log transformation as applied in (Gabrilovich, 2009), i.e., that is
    # how he defines TF values. In case of TF = 0, this shall not affect such value
    # freq_term_matrix.data = 1 + np.log( freq_term_matrix.data )
    # instantiate the tf-idf transformer
    tfidf = TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=True)
    # tfidf uses the freq_term_matrix to calculate IDF values for each word (element of the vocabulary)
    tfidf.fit(freq_term_matrix)
    # finally, tfidf will calculate TFIDF values with transform()
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    # tf_idf_matrix.data = np.log(np.log(tf_idf_matrix.data))
    tf_idf_matrix = normalize(tf_idf_matrix, norm="l2", axis=0, copy=False)
    # now we put our matrix into CSC format (it helps with accessing columns when inverting the vectors to
    # words' concept vectors)
    tf_idf_matrix = tf_idf_matrix.tocsc()
    # we need vocabulary_ to be accessible by word index, so we invert the keys and values of the
    # dictionary and put them into a new dictionary, word_index
    word_index = dict((v, k) for k, v in vectorizer.vocabulary_.iteritems())
    M, N = tf_idf_matrix.shape
    print "Articles: ", M
    print "Words: ", N
    return tf_idf_matrix, word_index, long_articles_with_id
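# A minimal sketch of the vocabulary inversion and column access pattern used above:
# vocabulary_ maps word -> column index, the inverted dict maps column -> word, and
# CSC format makes per-word (per-column) concept vectors cheap to slice (toy docs).
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ["the cat sat", "the dog sat", "the cat ran"]
vec = CountVectorizer()
tfidf_matrix = TfidfTransformer(sublinear_tf=True).fit_transform(vec.fit_transform(docs)).tocsc()

index_to_word = dict((v, k) for k, v in vec.vocabulary_.items())
word_concept_vectors = dict((index_to_word[col], tfidf_matrix.getcol(col))
                            for col in range(tfidf_matrix.shape[1]))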
Example #30
class TextSimilarity(object):
    '''
    classdocs
    '''

    def __init__(self,max_ngram=2,needStem=False):
        '''
        Constructor
        '''
        self.stemmer=PorterStemmer()
        if not needStem:
            self.vectorizer=CountVectorizer(stop_words = 'english',ngram_range=(1,max_ngram))
        else:
            self.vectorizer=CountVectorizer(analyzer=self.AnalyseText,ngram_range=(1,max_ngram))
        self.stop = stopwords.words('english')
        
    
    def get_cos_similarity(self,text1,text2):
        tfidf=self.vectorizer.fit_transform([text1,text2])
        cos_sim=cosine_similarity(tfidf[0], tfidf[1])[0][0]
        return cos_sim    
    
    def calculate_TF(self,sents):
        all_finished=True
        for sent in sents:
            if not hasattr(sent, 'tf'):
                all_finished=False
                break
        if all_finished: return 
        texts=[sent.content for sent in sents]
        vectors=self.get_count_vector(texts)
        for i in xrange(len(sents)):
            sents[i].tf=vectors[i]
    
    def calculate_sentence_similarity(self,sent1,sent2):
        return self.get_similarity_from_vectors(sent1.tf, sent2.tf)
    
    def get_count_vector(self,texts):
        vectors=self.vectorizer.fit_transform(texts)
        return vectors
    
    def get_similarity_from_vectors(self,vector1,vector2):
        sim=cosine_similarity(vector1, vector2)[0][0]
        return sim
        
    def AnalyseText(self,doc):
        doc=doc.lower()
        doc=re.sub(r'[^a-z\d\s]',' ',doc)
        doc=re.sub(r'\d','#',doc)
        tokens=doc.split()
        stems=[]
        for t in tokens:
            if len(t)<2 or t in self.stop: continue
            stems.append(self.stemmer.stem(t))
        return stems
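# Hypothetical usage of the TextSimilarity class above (nltk's PorterStemmer and
# stopwords are assumed to be imported at module level, as the class requires):
sim = TextSimilarity(max_ngram=2, needStem=True)
print sim.get_cos_similarity("The quick brown fox jumps over the lazy dog",
                             "A quick brown dog jumps over a lazy fox")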
Example #31
class Lsa(BaseModel):
    results_folder = Hyper.lsa_result

    def __init__(self, n_components=300):
        super(Lsa, self).__init__()

        self.svd = TruncatedSVD(n_components, random_state=42)
        self.vectorizer = CountVectorizer()

    @staticmethod
    def load():
        with open(Hyper.lsa_pickle, 'rb') as fd:
            return pickle.load(fd)

    def save(self):
        with open(Hyper.lsa_pickle, 'wb') as fd:
            pickle.dump(self, fd, protocol=4)

    def fit(self):
        print('create counter')
        new_corpus = self.vectorizer.fit_transform(self.corpus)

        print('fit lsa')
        self.svd.fit(new_corpus)

        print('save lsa')
        self.save()

    def process(self):
        with open(Hyper.processed_queries) as fd:
            queries = fd.readlines()

        query_to_docs = collections.defaultdict(list)
        with open(Hyper.sample_submission) as fd:
            fd.readline()
            for line in fd:
                line = line.strip().split(',')
                query_to_docs[int(line[0]) - 1].append(int(line[1]) - 1)

        queries_vec = list(map(lambda x: x.strip().split('\t')[1], queries))
        queries_vec = self.vectorizer.transform(queries_vec)
        corpus_vec = self.vectorizer.transform(self.corpus)

        queries_vec = self.svd.transform(queries_vec)
        corpus_vec = self.svd.transform(corpus_vec)

        model_results = ModelResult([])
        for query_id, doc_ids in tqdm.tqdm(query_to_docs.items()):
            sim = cosine_similarity(queries_vec[query_id].reshape(1, -1), corpus_vec[doc_ids])

            query_result = QueryResult(int(query_id) + 1, [])
            for i, doc_id in enumerate(doc_ids):
                doc_result = DocScore(doc_id + 1, sim[0][i])
                query_result.results.append(doc_result)

            model_results.queries.append(query_result)

        print('save results to {}'.format(self.results_folder))
        with open(self.results_folder, 'w') as fd:
            for query in model_results.queries:
                for doc in query.results:
                    fd.write('{}\t{}\t{}\n'.format(query.id, doc.id, doc.score))

        return model_results
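# A minimal, self-contained sketch of the LSA similarity idea the class above wraps
# (counts -> truncated SVD -> cosine similarity), using toy documents:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

toy_docs = ["cats chase mice", "dogs chase cats", "stocks rose sharply today"]
toy_counts = CountVectorizer().fit_transform(toy_docs)
toy_lsa = TruncatedSVD(n_components=2, random_state=42).fit_transform(toy_counts)
print(cosine_similarity(toy_lsa[0].reshape(1, -1), toy_lsa))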
Example #32
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from time import time
from collections import Counter

train, test = train_test_split(data, test_size=0.3, random_state=42)

train_clean_tweet = []
for tweets in train['text']:
    train_clean_tweet.append(tweets)
test_clean_tweet = []
for tweets in test['text']:
    test_clean_tweet.append(tweets)

v = CountVectorizer(analyzer="word")
train_features = v.fit_transform(train_clean_tweet)
test_features = v.transform(test_clean_tweet)

Classifiers = [
    LogisticRegression(C=0.000000001, solver='liblinear', max_iter=200),
    KNeighborsClassifier(3),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=200),
    AdaBoostClassifier(),
    GaussianNB()
]

dense_features = train_features.toarray()
dense_test = test_features.toarray()
Accuracy = []
Model = []
Example #33
    end_time = time.time()
    Tools.flushPrint("tokenized in {} seconds".format(end_time - start_time))

    start_time = time.time()
    vectorizer = CountVectorizer(
        # so we can pass it strings
        input='content',
        # turn off preprocessing of strings to avoid corrupting our keys
        lowercase=False,
        preprocessor=lambda x: x,
        # use our token dictionary
        tokenizer=lambda x: x,
        min_df=3,
        max_df=0.8,
        max_features=20000)
    vectorized_data = vectorizer.fit_transform(tokenized_data)
    # Tools.flushPrint(vectorized_data[:10])
    del tokenized_data
    end_time = time.time()
    Tools.flushPrint("vectorized in {} seconds".format(end_time - start_time))

    start_time = time.time()
    n_topics = 50
    lda = trainLDA(vectorized_data, n_topics, max_iterations)
    Tools.flushPrint(lda)
    try:
        Tools.dillDump(os.path.join(output_data_dir, "lda_tag.pkl"), lda)
    except:
        pass
    end_time = time.time()
    Tools.flushPrint("Trained LDA in {} Seconds".format(end_time - start_time))
Example #34

#Opening train and test file from tsv with separator \t
train= pd.read_csv("train.tsv", sep="\t")
test= pd.read_csv("test.tsv", sep="\t")

#Converting the reviews into a count matrix
#The features are bag-of-words counts with tokenization, removal of the <br /> tag, n-grams and mark_negation
count_vector = CountVectorizer(analyzer="word",                                          #word analyzer so the custom tokenizer below is applied
                            tokenizer=lambda text: mark_negation(word_tokenize(text)),  #override tokenization to mark negated terms
                            preprocessor=lambda text: text.replace("<br />", " "),      #override preprocessing to replace the <br /> tag with a space
                            ngram_range=(1, 3),                                         #use unigrams, bigrams and trigrams
                            )

#Fit to data and transform it
train_counts = count_vector.fit_transform(train['Phrase'])

#Set up a tf-only transformer (use_idf=False) and fit it to the counts
tf_transformer = TfidfTransformer(use_idf=False).fit(train_counts)

#Transforming a count matrix to a tf representation
train_tf = tf_transformer.transform(train_counts)

tfidf_transformer = TfidfTransformer()

#Fit to data and transform it
train_tfidf = tfidf_transformer.fit_transform(train_counts)

#Fit a Linear Support Vector Classification classifier on the tf-idf features
classifier = svm.LinearSVC().fit(train_tfidf, train['Sentiment'])
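#A hedged sketch of applying the fitted objects above to the held-out file (the
#'Phrase' column in test.tsv is an assumption mirroring the training file)
test_counts = count_vector.transform(test['Phrase'])
test_tfidf = tfidf_transformer.transform(test_counts)
predicted = classifier.predict(test_tfidf)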
Example #35
def fichier_rec(myDirectory):

    for f in listdir(myDirectory):
        chemin = join(myDirectory, f)
        if isfile(chemin):
            with open(chemin, 'rb') as file:
                content = ''
                for line in file:
                    word = str(line).split(" ")
                    for m in word:
                        if not (pattern.match(str(m))):
                            content += str(m) + " "
            documents = [content]

            cv = CountVectorizer(stop_words="english")

            count_vector = cv.fit_transform(documents)

            #sort the counts of the first document in descending order
            sorted_items = sort_coo(count_vector[0].tocoo())

            #Get feature names (words/n-grams). It is sorted by position in sparse matrix
            feature_names = cv.get_feature_names()
            n_grams = extract_topn_from_vector(feature_names, sorted_items, 20)

            listCat = {}

            with open(LaCateg) as json_file:
                if not exists(monRep):
                    mkdir(monRep)

                dataCateg = json.load(json_file)

                for key in dataCateg.keys():
                    if not exists(monRep + "/" + key):
                        mkdir(monRep + "/" + key)

                    if not exists(monRep + "/Other"):
                        mkdir(monRep + "/Other")

                    listCat[key] = 0

                for key in dataCateg.keys():
                    for i in n_grams:
                        if i[0] in dataCateg[key]:
                            listCat[key] += i[1]

            cpt = 0
            k = ""
            for j in listCat:
                if cpt < listCat[j]:
                    cpt = listCat[j]
                    k = j

            if k == "":
                k = "Other"
            """with open(allVariables.pathToProg + "/class.txt", "a") as clas:
                clas.write("fichier: " + str(chemin.split("\\")[-1]) + "\n")
                clas.write("listCat: " + str(listCat) + "\n")
                clas.write("mot: " + str(n_grams) + "\n")
                clas.write(k + "\n\n")"""

            copyfile(
                chemin, monRep + "/" + k + "/" + basename(chemin) + "-" +
                myDirectory.split("\\")[-1])
        else:
            print(chemin)
            fichier_rec(chemin)
wordcloud_pos_in_pos = WordCloud(background_color='black',
                                 width=1800,
                                 height=1400).generate(corpus_m_pos_in_pos)

plt.imshow(wordcloud_pos_in_pos)

# Unique words
unique_words = list(set(" ".join(corpus_m_words).split(" ")))

#################################################

#################################################

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 3))
X1 = vectorizer.fit_transform(words_tokens)
features = list(vectorizer.get_feature_names())

counter = Counter(features)
counter = Counter(counter)
counter.most_common(20)
# convert list of tuples into data frame
freq_df_b = pd.DataFrame.from_records(counter.most_common(20),
                                      columns=['bigram', 'Count'])

#Creating a bar plot
freq_df_b.plot(kind='bar', x='bigram', figsize=(15, 10), fontsize=15)

vect = CountVectorizer(min_df=5, ngram_range=(1, 2)).fit(words_tokens)

X1 = vect.transform(words_tokens)
Example #37
print()
places_df['des'] = places_df[['des', 'ktop:category']].apply(lambda x:
                                                             (' ').join(x),
                                                             axis=1)
places_df['lab'] = places_df[['lab', 'rdf:type']].apply(lambda x:
                                                        (' ').join(x),
                                                        axis=1)
print(places_df['lab'])
print(places_df['des'])
print("==문자열로 재 변환 완료==")

count_vect = CountVectorizer(min_df=0, ngram_range=(1, 2))
print(count_vect)

print("==유사도 벡터화 작업 중==")
cat_mat1 = count_vect.fit_transform(places_df['des'])
cat_mat2 = count_vect.fit_transform(places_df['lab'])

print("cat_mat1",
      cat_mat1.shape,
      cat_mat1,
      "cat_mat2",
      cat_mat2.shape,
      cat_mat2,
      sep="\n")

cat_sim1 = cosine_similarity(cat_mat1, cat_mat1)
cat_sim2 = cosine_similarity(cat_mat2, cat_mat2)
cat_sim1 *= 0.9
cat_sim2 *= 0.1
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)
Example #39
                      max_words=100,
                      max_font_size=50,
                      random_state=42).generate(str(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("social.png", dpi=900)

from sklearn.feature_extraction.text import CountVectorizer
import re
cv = CountVectorizer(stop_words=stop_words,
                     max_features=10000,
                     ngram_range=(1, 3))
X = cv.fit_transform(corpus)
list(cv.vocabulary_.keys())[:10]


#Most frequently occurring words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]


#Convert most freq words to dataframe for plotting bar plot
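#A hedged sketch of the DataFrame conversion the comment above refers to (reuses
#`corpus` from this example; the column names are illustrative)
import pandas as pd
top_words_df = pd.DataFrame(get_top_n_words(corpus, n=20), columns=['word', 'count'])
top_words_df.plot(kind='bar', x='word', figsize=(15, 10), fontsize=15)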
Example #40
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv('final_data(movies 1950-2020).csv')
movies_name = list(df['movie_title'])
df['comb'] = df['actor_1_name'] + ' ' + df['actor_2_name'] + ' ' + df[
    'actor_3_name'] + ' ' + df['director_name'] + ' ' + df['genres']
df['comb'] = df['comb'].fillna('unknown')
df['genres'] = df['genres'].replace('None', ' ')
df.set_index('movie_title', inplace=True)

# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['comb'])

# creating a Series for the movie titles so they are associated with an ordered numerical
# list I will use later to match the indexes
indices = pd.Series(df.index)

cosine_sim = cosine_similarity(count_matrix)
# function that takes in movie_title as input and returns the top 10 recommended movies


def recommendations(title):

    recommended_movies = []
    ratings = []
    title = title.lower()
    # getting the index of the movie that matches the title
Example #41
import data_scan
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
import numpy as np
import pandas
"""using sklern for the bernoulli naive bayes to test my implementation's accuracy"""

#import the polished data
X_train, X_test, y_train, y_tests = data_scan.main_test()
features = 10000
vect = CountVectorizer(max_features=features, binary=True)
X_train_vectorized = vect.fit_transform(X_train)
X_train_vectorized_array = X_train_vectorized.toarray()
X_test_vectorized = vect.transform(X_test)  # reuse the training vocabulary instead of refitting on the test set
X_test_vectorized_array = X_test_vectorized.toarray()

#vectorizing the words - bag of words

#Bernoulli naive Bayes
bernoulli = BernoulliNB().fit(X_train_vectorized, y_train)
#prediction
prediction = bernoulli.predict(X_test_vectorized)  # return predicted y

print(metrics.accuracy_score(prediction, y_tests))
#writing out the data
#np.savetxt("submission.csv", np.column_stack((kaggle_files, kaggle_label)), delimiter=",")
Example #42
        })

    # print data_clean[0]

    data_words_only = []

    for article in data_clean:
        data_words_only.append(" ".join(
            article["body"]))  # glue all words together into a list of strings

    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words='english')

    _features = vectorizer.fit_transform(data_words_only)

    _features_array = _features.toarray()

    print "Got all features...", _features_array.shape

    # ----------------------------------------

    sim1 = min_hash(10, _features_array, data_clean)

    buckets = {}
    print len(sim1)

    for id in sim1:
        similarities_for_one_id = sim1[id]
        sims = []
Example #43
def print_topics(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
         #print(" ".join([feature_names[i]
          #              for i in topic.argsort()[:-n_top_words - 1:-1]]))
         topicList.append(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]).encode('utf-8'))
    print("Deepak printing **************")
    print(topicList)
    print()
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=20, max_features=n_features,
                                stop_words='english')
t0 = time()
corpus = featureContents
tf = tf_vectorizer.fit_transform(corpus)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tf)
#exit()
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)
Example #44
        data.append(temp)
        file.close()
        if path == pos_path:
            class_id.append(1)
        else:
            class_id.append(0)
    return data


# the training data consists of 20 positive reviews and 20 negative reviews
train_data = read_in_files(all_neg, neg_path, 0, 20, train_class)
train_data += read_in_files(all_pos, pos_path, 0, 20, train_class)

x_train, x_test, y_train, y_test = train_test_split(train_data,
                                                    train_class,
                                                    test_size=0.30)

vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform([r for r in x_train])
test_features = vectorizer.transform([r for r in x_test])

logreg = LogisticRegression()
logreg.fit(train_features, y_train)
y_pred = logreg.predict(test_features)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(
    logreg.score(test_features, y_test)))

confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)
print(classification_report(y_test, y_pred))
Example #45
    messages = []
    categories = []

    # train_files = ['train/data.json']
    train_files = glob.glob('train/*.json')
    for input_json in train_files:
        with open(input_json, 'r') as f:
            lines = json.loads(f.read())

        for line in lines:
            messages.append(line['test_message'])
            categories.append(line['reason_id'])

    # vectorize
    count_vector = CountVectorizer(tokenizer=Token.token)
    X_train_counts = count_vector.fit_transform(messages)
    # print(count_vector.vocabulary_)

    # tf-idf
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = Classifier.naive_bayes(X_train_tfidf, categories)

    test_json = 't.json'
    fail_messages = []
    with open(test_json, 'r') as f:
        lines = json.loads(f.read())

    for line in lines:
        fail_messages.append(line['test_message'])
    review = re.sub(r'\W', ' ', str(X[i]))
    review = review.lower()
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+br\s+', ' ', review)
    review = re.sub(r'\s+[a-z]\s+', ' ', review)
    review = re.sub(r'^b\s+', '', review)
    review = re.sub(r'\s+', ' ', review)
    corpus.append(review)

# Creating the BOW model
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=2234,
                             min_df=3,
                             max_df=0.6,
                             stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()
B = X

# Creating the Tf-Idf Model
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X = transformer.fit_transform(X).toarray()

# Creating the Tf-Idf model directly
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=2234,
                             min_df=3,
                             max_df=0.6,
                             stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()
    for w in words:
        if (w not in stop_words):
            new_words.append(w)

    after_stem_words = []
    for w in new_words:
        after_stem_words.append(ps.stem(w))
    clean_msg = ' '.join(after_stem_words)
    return clean_msg


df['Review'] = df.Review.apply(clean_text)
print('data cleaned...')
# df.Liked.value_counts().plot(kind='bar')

X = cv.fit_transform(df.Review).toarray()
new_X = pca.fit_transform(X)
y = df.iloc[:, -1].values
print('going for training...')
log.fit(new_X, y)
print('model trained....')

# def graph():
#     a=df.Liked.value_counts().plot(kind='bar')
#     l4.configure(a)

root = Tk()
root.state('zoomed')
root.configure(background='gray85')
root.title("Restaurant Reviews Project")
Example #48
############################################################
##                       Model setup                       ##
############################################################
with open("E:/AB104/AlgorithmTest/Jieba_Booking.json", 'r') as a:
    data = json.load(a)

data = DataFrame(data)
classifier = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(data['comments'].values,
                                                    data['mark'].values,
                                                    test_size=0)

targets = y_train
# print len(targets) #241221
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X_train)
# print len(X_train) #241221
classifier.fit(counts, targets)

############################################################
##               Save the evaluation results               ##
############################################################
commList_Jieba_marked = []
for i in commList_Jieba:
    commList_Jieba_marked_dict = {}
    examples = [i["comments"]]
    # print i["comments"]
    example_counts = count_vectorizer.transform(examples)
    predictions = classifier.predict(example_counts)
    commList_Jieba_marked_dict["mark"] = predictions.tolist()
    # print predictions
Example #49
0
x = []
y = []
for i in range(len(tweets_data)):
    if tweets_data[i]['id'] == sent['id'][i]:
        x.append(tweets_data[i]['text'])
        y.append(sent['sentiment'][i])
#print(x[0].split(" "))
#print(y[0])

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform(x)

actual = y[:-500]

nb = MultinomialNB()
nb.fit(train_features, [int(r) for r in y])

test_features = vectorizer.transform(x[:-500])

test_try = vectorizer.transform([
    "Can we all stop treating anxiety like it's a choice and something cool to have thank you"
])
test_try2 = vectorizer.transform(["I feel like drinking alchohol"])
predict2 = nb.predict(test_try)
predict3 = nb.predict(test_try2)
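
# Hedged sketch (not in the original): score the model with the imported metrics module.
# Note that 'actual' and 'test_features' both come from the x[:-500] slice, which overlaps
# the training data, so this only yields an optimistic in-sample accuracy.
predicted = nb.predict(test_features)
print(metrics.accuracy_score([int(r) for r in actual], predicted))
print(predict2, predict3)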
Example #50
0
def pipeline_train(train, test, lim_unigram):
    """

    Process train set, create relevant vectorizers

    Args:
        train: Data object, training set
        test: Data object, testing set
        lim_unigram: int, number of most frequent words to consider

    Returns:
        train_set: list, of numpy arrays
        train_stances: list, of ints
        bow_vectorizer: sklearn CountVectorizer
        tfreq_vectorizer: sklearn TfidfTransformer(use_idf=False)
        tfidf_vectorizer: sklearn TfidfVectorizer()

    """

    # Initialise
    heads = []
    heads_track = {}
    bodies = []
    bodies_track = {}
    body_ids = []
    id_ref = {}
    train_set = []
    train_stances = []
    cos_track = {}
    test_heads = []
    test_heads_track = {}
    test_bodies = []
    test_bodies_track = {}
    test_body_ids = []
    head_tfidf_track = {}
    body_tfidf_track = {}

    # Identify unique heads and bodies
    for instance in train.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in heads_track:
            heads.append(head)
            heads_track[head] = 1
        if body_id not in bodies_track:
            bodies.append(train.bodies[body_id])
            bodies_track[body_id] = 1
            body_ids.append(body_id)

    for instance in test.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in test_heads_track:
            test_heads.append(head)
            test_heads_track[head] = 1
        if body_id not in test_bodies_track:
            test_bodies.append(test.bodies[body_id])
            test_bodies_track[body_id] = 1
            test_body_ids.append(body_id)

    # Create reference dictionary
    for i, elem in enumerate(heads + body_ids):
        id_ref[elem] = i

    # Create vectorizers and BOW and TF arrays for train set
    bow_vectorizer = CountVectorizer(max_features=lim_unigram,
                                     stop_words=stop_words)
    bow = bow_vectorizer.fit_transform(heads + bodies)  # Train set only

    tfreq_vectorizer = TfidfTransformer(use_idf=False).fit(bow)
    tfreq = tfreq_vectorizer.transform(bow).toarray()  # Train set only

    tfidf_vectorizer = TfidfVectorizer(max_features=lim_unigram, stop_words=stop_words).\
        fit(heads + bodies + test_heads + test_bodies)  # Train and test sets

    # Process train set
    for instance in train.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        head_tf = tfreq[id_ref[head]].reshape(1, -1)
        body_tf = tfreq[id_ref[body_id]].reshape(1, -1)
        if head not in head_tfidf_track:
            head_tfidf = tfidf_vectorizer.transform([head]).toarray()
            head_tfidf_track[head] = head_tfidf
        else:
            head_tfidf = head_tfidf_track[head]
        if body_id not in body_tfidf_track:
            body_tfidf = tfidf_vectorizer.transform([train.bodies[body_id]
                                                     ]).toarray()
            body_tfidf_track[body_id] = body_tfidf
        else:
            body_tfidf = body_tfidf_track[body_id]
        if (head, body_id) not in cos_track:
            tfidf_cos = cosine_similarity(head_tfidf,
                                          body_tfidf)[0].reshape(1, 1)
            cos_track[(head, body_id)] = tfidf_cos
        else:
            tfidf_cos = cos_track[(head, body_id)]
        feat_vec = np.squeeze(np.c_[head_tf, body_tf, tfidf_cos])
        train_set.append(feat_vec)
        train_stances.append(label_ref[instance['Stance']])

    return train_set, train_stances, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer
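

# Hedged usage sketch (not part of the original module). Assuming 'train' and 'test' are the
# Data objects described in the docstring and that the globals stop_words and label_ref are
# defined elsewhere in the file:
# train_set, train_stances, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer = \
#     pipeline_train(train, test, lim_unigram=5000)
# X_train = np.asarray(train_set)    # shape: (n_instances, 2 * lim_unigram + 1)
# y_train = np.asarray(train_stances)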
values = pickle.load(open(conf.fileValues, 'rb'))
corpusKey = conf.corpusKey

#corpus = {d:{k:corpus[d][k][:100] for k in corpus[d]} for d in corpus}

print("--------")

y = [values.index(i) for i in corpus['train'][corpusKey]]
yV = [values.index(i) for i in corpus['valid'][corpusKey]]
yT = [values.index(i) for i in corpus['test'][corpusKey]]

vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=3)
transformer = TfidfTransformer(smooth_idf=False)

#train
counts = vectorizer.fit_transform(corpus['train']['text'])
tfidf = transformer.fit_transform(counts)

#valid
countsValid = vectorizer.transform(corpus['valid']['text'])
tfidfValid = transformer.transform(countsValid)

#test
countsTest = vectorizer.transform(corpus['test']['text'])
tfidfTest = transformer.transform(countsTest)

bestAcc = 0.
for nn in range(1, 30):
    print("{}: ".format(nn), end="")
    clf = neighbors.KNeighborsClassifier(nn, weights='uniform')
    clf.fit(tfidf, y)
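    # Assumed continuation (the original snippet is cut off here): score each k on the
    # validation split and keep track of the best accuracy.
    acc = clf.score(tfidfValid, yV)
    print("{:.4f}".format(acc))
    if acc > bestAcc:
        bestAcc = acc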
ps=PorterStemmer()
for w in filedata:
    filedata3.append(ps.stem(w))
   
unique = []

for fdata in filedata3:          # each file's data is stored in one single string, so each string is split ...
    neww = fdata.split(' ')      # ... to obtain all the words of that document for later use in the VSM
    neww = neww[:-1]             # every document's last token is an empty string (trailing space), so it is dropped
    unique.append(neww)          # IMPORTANT NOTE: 'unique' does not contain the unique words of all files; it is just the variable name
                        
count_vec = CountVectorizer(stop_words='english',
                            ngram_range=(1, 1), max_df=0.2, min_df=0.1, max_features=None)
#count_train = count_vec.fit(filedata)
#bag_of_words = count_vec.transform(filedata)
bag_of_words2 = count_vec.fit_transform(filedata)

#print(count_vec.get_feature_names())
#tfidfmatrix = count_vec.fit_transform(filedata)
tfidf_vector = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=0.1, stop_words="english")
#words = word_tokenize(tfidf_vector)
#print(tfidf_vector)

tfidf_matrix = tfidf_vector.fit_transform(filedata)
print(tfidf_matrix)
print('tfidf_matrix shape: ',tfidf_matrix.shape)


num_clusters = 5
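
# Hedged sketch (the original snippet stops here): a plausible continuation is to cluster
# the tf-idf matrix into num_clusters groups with KMeans.
from sklearn.cluster import KMeans
km = KMeans(n_clusters=num_clusters, random_state=0)
km.fit(tfidf_matrix)
print(km.labels_)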
# Naive Bayes news classification
from sklearn.datasets import fetch_20newsgroups
# fetch_20newsgroups downloads the data from the internet on demand
news = fetch_20newsgroups(subset='all')
# Inspect the data samples
print(len(news.data))


# Split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(news.data,news.target,test_size=0.25,random_state=33)

# Convert the text into count vectors
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)

# Import the MultinomialNB model from naive_bayes
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train,y_train)
y_predict = mnb.predict(x_test)

# Evaluate performance
from sklearn.metrics import classification_report
print('The accuracy of Naive Bayes Classifier is ',mnb.score(x_test,y_test))
print(classification_report(y_test,y_predict,target_names=news.target_names))
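
# Hedged follow-up sketch (not in the original): classify unseen text with the fitted
# vectorizer and model.
new_docs = ['OpenGL rendering on the GPU is fast']
new_vec = vec.transform(new_docs)
print(news.target_names[mnb.predict(new_vec)[0]])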
Example #54
0
from sklearn import metrics


# In[495]:

# instantiate CountVectorizer
# using a unigram model restricted to the 3000 most frequent words;
# the baseline classifier is built on this representation (notes from meeting with Kanchana)
stop = set(stopwords.words('english'))
vect = CountVectorizer(stop_words=stop, max_features = 3000)


# In[496]:

# fit and transform X_train into X_train_fit
X_train_fit = vect.fit_transform(X_train)
X_train_fit.shape


# In[497]:

# transform X_test into X_test_fit
X_test_fit = vect.transform(X_test)
X_test_fit.shape


# In[498]:

# import and instantiate Multinomial NB classifier
nb = MultinomialNB()
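
# Hedged sketch (the snippet is cut off here): fit and score the baseline classifier,
# assuming label vectors y_train / y_test exist alongside X_train / X_test.
nb.fit(X_train_fit, y_train)
y_pred = nb.predict(X_test_fit)
print(metrics.accuracy_score(y_test, y_pred))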
Example #55
0
# -*- coding: UTF-8 -*-
import numpy

categories = [
    'alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'
]

from sklearn.datasets import fetch_20newsgroups

twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories,
                                  shuffle=True,
                                  random_state=42)
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
# reuse the already-fitted tfidf_transformer; calling transform() on a freshly created
# TfidfTransformer() would raise NotFittedError
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
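
# Hedged continuation (follows the standard scikit-learn text tutorial): predict the
# category of each new document.
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))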
Example #56
0
class FeatureExtractor(object):
    def __init__(self,
                 embedded_transformer=None,
                 external_features=None,
                 FEATURE_SCALER='StandardScaler'):
        """
        Initialize variables and check essay set type
        """
        self.transformers = None
        self.feature_type = None
        self.selected_columns = None
        self.POS_ngram = (
            2,
            3,
        )  # hard-coded, no unigrams
        self.text_types = None
        self.analysis_type = None
        self.featureselection = None
        self.embedded_transformer = embedded_transformer
        self.external_features = external_features
        self.word_embedding = None
        self.maxlen_words = None
        self.final_features = -1
        self.maxlen = None
        self.sequence_model_type = None
        self.embedding_type = None
        self.feature_scale_multiplier = None
        self.ngramMaxLength = None
        self.FEATURE_SCALER = FEATURE_SCALER
        self.sequence_vocabulary = None
        self.final_data_scaler = None  # final scaling transform of data matrix

    # Apply the learned transforms to new (test) data; no fitting takes place and no target data is used.
    # The order of operations is crucial, otherwise the results are nonsense.
    def transform(self,
                  data_x,
                  x_meta=None,
                  x_custom=None,
                  post_transformer=None,
                  text_IDs=None,
                  stringkernels=None):

        #print('Transforming features (testing)')
        XX = []
        if 'CUSTOM' in self.feature_type:
            x = np.array(x_custom, dtype=float)
            XX.append(x)

        if 'TAGS' in self.feature_type:
            x = np.array(x_meta, dtype=float)
            XX.append(x)

        if len(XX) > 0:
            XX = np.concatenate(tuple(XX), axis=1)

        if self.analysis_type == 'SEQUENCE':
            from keras.preprocessing import sequence

            # apply feature scaling
            if len(XX) > 0:
                XX = self.final_data_scaler.transform(XX)

            if self.sequence_model_type == 'FASTTEXT':

                X = data_x[self.text_types[0]]

                # replace unknown words with RARE_WORD
                for k1 in range(0, len(X)):
                    for k2 in range(0, len(X[k1])):
                        token = X[k1][k2]
                        if token not in self.sequence_vocabulary:
                            X[k1][k2] = 'RARE_WORD'

                # convert words to counts
                X_mat = self.transformer.transform(X)

                X = []
                for row in range(X_mat.shape[0]):
                    # how many tokens of a kind
                    tokens = [-1 for _ in range(np.sum(X_mat[row, :]))]
                    # nonzero elements
                    ind = np.argwhere(X_mat[row, :] > 0)
                    k = 0
                    for _, col in ind:
                        for _ in range(0, X_mat[row, col]):
                            tokens[k] = col + 1
                            k += 1
                    assert tokens[-1] > -1, 'Negative indices found! BUG!'
                    X.append(tokens)

                # print('Pad sequences (samples x time)')
                if len(XX) > 0:
                    X_test = [
                        sequence.pad_sequences(X, maxlen=self.maxlen), XX
                    ]
                else:
                    X_test = [sequence.pad_sequences(X, maxlen=self.maxlen)]
                # x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

            else:

                X = {'SENTENCES': [], 'FLAT': []}
                for k1, text in enumerate(data_x[self.text_types[0] +
                                                 '_SENTENCES']):
                    X['SENTENCES'].append([])
                    for k2, sent in enumerate(text):
                        if k2 < self.maxlen_doc:
                            X['SENTENCES'][-1].append([])
                            for k3, word in enumerate(sent):
                                if k3 < self.maxlen_sent:
                                    # three cases: (1) in list and dictionary (2) in dictionary (3) nowhere
                                    if word not in self.sequence_vocabulary:
                                        word = 'RARE_WORD'
                                    word_index = self.sequence_vocabulary[word]
                                    X['SENTENCES'][-1][-1].append(word_index)
                    X['FLAT'].append(
                        list(itertools.chain.from_iterable(
                            X['SENTENCES'][-1])))

                X['FLAT'] = sequence.pad_sequences(X['FLAT'],
                                                   maxlen=self.maxlen_words)

                if len(XX) > 0:
                    X_test = [X, XX]
                else:
                    X_test = [
                        X,
                    ]

        elif self.analysis_type[0:3] == 'BOW':

            # apply all transformers sequentically (same as in training)
            X_test = []
            for transformer in self.transformers:
                # apply raw data transform
                x = transformer[0].transform(
                    data_x[transformer[1]]).todense()  #.astype(np.float32)
                # apply scaling transform, identity for TFIDF
                X_test.append(x)

            # apply all selections sequentically (same as in training)
            is_selected = False
            if len(
                    X_test
            ) > 0 and self.featureselection != None and self.featureselection[
                    1] != 'global':
                is_selected = True
                if self.featureselection[1] == 'single':
                    for k, x in enumerate(X_test):
                        X_test[k] = np.take(x,
                                            indices=self.selected_columns[k],
                                            axis=1)
                elif self.featureselection[1] == 'all':
                    X_test = np.concatenate(tuple(X_test), axis=1)
                    X_test = [
                        np.take(X_test, indices=self.selected_columns, axis=1)
                    ]
                else:
                    raise (Exception(
                        'Unknown featureselection, must be single or all!'))

            # add embedding features
            if 'EMBEDDING' in self.feature_type:
                if self.embedding_type == 'LEMMA':
                    x = self.embedded_transformer.transform(
                        replace_hash(
                            data_x[self.embedding_type]))  #.astype(np.float32)
                else:
                    x = self.embedded_transformer.transform(
                        data_x[self.embedding_type])  # .astype(np.float32)
                X_test.append(x)

            # add external data, if any
            if len(XX) > 0:
                X_test.append(XX)

            X_test = np.concatenate(tuple(X_test), axis=1)

            if self.featureselection != None and self.featureselection[
                    1] == 'global':
                assert is_selected == False, 'Trying selection twice!'
                X_test = np.take(X_test, indices=self.selected_columns, axis=1)

            # apply feature scaling
            X_test = self.final_data_scaler.transform(X_test)

            if self.analysis_type == 'BOW_StringKernel':
                assert len(set(text_IDs[0]).intersection(
                    text_IDs[1])) == 0, 'test and train data are overlapping!'
                X_stringkernel = get_stringkernel(stringkernels,
                                                  self.text_types, text_IDs,
                                                  self.ngramMaxLength)
                X_test = self.stringkernel_ratio * X_stringkernel + (
                    1.0 - self.stringkernel_ratio) * self.kernelfunction(
                        X=X_test, Y=self.kerneldata_Y)

            assert self.final_features == X_test.shape[
                1], 'Final feature size not equal!'

        if post_transformer is not None:
            X_test = post_transformer(X_test)

        return X_test

    # get best features
    def get_best(self, x, pass2_features):
        ind = np.argsort(x)
        ind = np.flipud(ind)
        assert x[ind[0]] == max(x), 'sort failed!'
        return ind[0:pass2_features]

    # method to choose columns
    def column_selector(self, X, Y, type, pass2_features):
        if type == 'regression':
            val = f_regression(X, Y)
            val = val[0] / np.max(val[0])  # these are f-values!
            return self.get_best(val, pass2_features)
        elif type == 'fisher':
            return self.fisher_selector(Y, X, pass2_features)
        elif type == 'chi2':
            return self.chi2_selector(Y, X, pass2_features)
        elif type == 'mutualinfo':
            val = mutual_info_regression_partial(X, Y)
            val = val / np.max(val)
            return self.get_best(val, pass2_features)
        else:
            raise (Exception('Unknown method'))

    def chi2_selector(self, set_score, dict_mat, max_feats_pass2):
        med_score = np.median(set_score)
        new_score = set_score
        new_score[set_score < med_score] = 0
        new_score[set_score >= med_score] = 1

        ch2 = SelectKBest(chi2, k=max_feats_pass2)
        ch2.fit(dict_mat, new_score)
        good_cols = ch2.get_support(indices=True)
        return good_cols

    def fisher_selector(self, set_score, dict_mat, max_feats_pass2):
        med_score = np.median(set_score)
        new_score = set_score
        new_score[set_score < med_score] = 0
        new_score[set_score >= med_score] = 1

        new_score_1 = new_score == 1
        new_score_0 = new_score == 0

        fish_vals = np.empty(dict_mat.shape[1])
        fish_vals[:] = np.nan

        for col_num in range(0, dict_mat.shape[1]):

            # loop_vec = np.squeeze(np.asarray(dict_mat[:, col_num]))
            # good_loop_vec = loop_vec[new_score == 1]
            # bad_loop_vec = loop_vec[new_score == 0]
            # good_loop_present = len(good_loop_vec[good_loop_vec > 0])
            # good_loop_missing = len(good_loop_vec[good_loop_vec == 0])
            # bad_loop_present = len(bad_loop_vec[bad_loop_vec > 0])
            # bad_loop_missing = len(bad_loop_vec[bad_loop_vec == 0])

            loop_vec = dict_mat[:, col_num]
            good_loop_vec = loop_vec[new_score_1]
            bad_loop_vec = loop_vec[new_score_0]
            good_loop_present = np.sum(good_loop_vec != 0)
            good_loop_missing = np.sum(good_loop_vec == 0)
            bad_loop_present = np.sum(bad_loop_vec != 0)
            bad_loop_missing = np.sum(bad_loop_vec == 0)

            fish_vals[col_num] = pvalue(good_loop_present, bad_loop_present,
                                        good_loop_missing,
                                        bad_loop_missing).two_tail

        cutoff = 1
        if (len(fish_vals) > max_feats_pass2):
            cutoff = sorted(fish_vals)[max_feats_pass2]
        good_cols = np.asarray([
            num for num in range(0, dict_mat.shape[1])
            if fish_vals[num] <= cutoff
        ])
        return good_cols

    # tf-idf weighted transformer for document embedding
    class TfidfEmbeddingVectorizer(object):
        def __init__(self, word2vec, dim):
            self.word2vec = word2vec
            self.word2weight = None
            self.dim = dim

        def fit(self, X, y=None):
            tfidf = TfidfVectorizer(analyzer=lambda x: x)
            tfidf.fit(X)
            # if a word was never seen - it must be at least as infrequent
            # as any of the known words - so the default idf is the max of
            # known idf's
            max_idf = max(tfidf.idf_)
            self.word2weight = defaultdict(
                lambda: max_idf,
                [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
            return self

        def transform(self, X, y=None):
            return np.array([
                np.mean([
                    self.word2vec[w] * self.word2weight[w]
                    for w in words if w in self.word2vec
                ] or [np.zeros(self.dim)],
                        axis=0) for words in X
            ])

    def main(self,
             data_x,
             data_y,
             Params,
             x_meta=None,
             x_custom=None,
             print_info=False,
             text_IDs=None,
             stringkernels=None):

        if Params['Algorithm'][0] == 'SEQUENCE':
            self.analysis_type = 'SEQUENCE'
        else:
            self.analysis_type = 'BOW'

        self.transformers = []
        self.text_types = Params['TextTypes']
        self.feature_type = Params['FeatureMethod']
        if self.feature_type is None:
            self.feature_type = []

        # custom text features
        if 'CUSTOM' in self.feature_type:
            assert x_custom is not None, 'Custom data not set!'
            if print_info:
                start_time = time.time()
                print('... adding (custom) count measures', end='')
            x = np.array(x_custom[1], dtype=float)
            x_label = x_custom[0]
            X_custom = x
            X_custom_columns = x_label
            if print_info:
                end_time = time.time()
                print(' ... done (%1.1fs)' % (end_time - start_time))

        # tag features
        if 'TAGS' in self.feature_type:
            assert x_meta != None, 'Metadata not set!'
            if print_info:
                start_time = time.time()
                print('... adding metainfo', end='')
            x = np.array(x_meta[1], dtype=float)
            x_label = x_meta[0]
            X_tags = x
            X_tags_columns = x_label
            if print_info:
                end_time = time.time()
                print(' ... done (%1.1fs)' % (end_time - start_time))

        if self.analysis_type == 'SEQUENCE':

            from keras.preprocessing import sequence
            # convert text to index sequences, returns
            # data = text x sentence x word
            XX = []
            XX_columns = []
            if 'CUSTOM' in self.feature_type:
                XX.append(X_custom)
                XX_columns.append(X_custom_columns)

            if 'TAGS' in self.feature_type:
                XX.append(X_tags)
                XX_columns.append(X_tags_columns)

            if len(XX) > 0:
                XX = np.concatenate(tuple(XX), axis=1)
                self.final_data_scaler = get_scaler(self.FEATURE_SCALER)
                XX = self.final_data_scaler.fit_transform(XX)

            X = data_x[self.text_types[0]]
            max_sequence = max([len(x) for x in X])

            if Params['Algorithm'][1]['algorithm'] == 'FASTTEXT':

                self.sequence_model_type = 'FASTTEXT'

                X = [x[0:np.minimum(max_sequence, len(x))] for x in X]

                # get all words that appeared at least in two articles
                transformer = CountVectorizer(tokenizer=lambda x: x,
                                              preprocessor=lambda x: x,
                                              max_df=1.0,
                                              min_df=2,
                                              max_features=50000,
                                              ngram_range=(1, 1))
                transformer.fit(X)

                for k1 in range(0, len(X)):
                    for k2 in range(0, len(X[k1])):
                        token = X[k1][k2]
                        if token not in transformer.vocabulary_:
                            X[k1][k2] = 'RARE_WORD'

                self.transformer = CountVectorizer(
                    tokenizer=lambda x: x,
                    preprocessor=lambda x: x,
                    max_df=1.0,
                    min_df=2,
                    max_features=100000,
                    ngram_range=(1, Params['Algorithm'][1]['ngram']))
                X_mat = self.transformer.fit_transform(X)

                self.sequence_vocabulary = {
                    key: (val + 1)
                    for key, val in self.transformer.vocabulary_.items()
                }  # additional tokens for empty and unknown word

                assert 'PADDED_WORD' not in self.transformer.vocabulary_

                self.sequence_vocabulary['PADDED_WORD'] = 0

                ind2word = [
                    '' for x in range(0, len(self.sequence_vocabulary))
                ]
                for word in self.sequence_vocabulary.keys():
                    ind2word[self.sequence_vocabulary[word]] = word

                maxlen_words = 0
                X = []
                for row in range(X_mat.shape[0]):
                    tokens = [-1 for _ in range(np.sum(X_mat[row, :]))]
                    ind = np.argwhere(X_mat[row, :] > 0)
                    k = 0
                    for _, col in ind:
                        for _ in range(0, X_mat[row, col]):
                            tokens[k] = col + 1
                            k += 1
                    maxlen_words = np.maximum(maxlen_words, len(tokens))
                    X.append(tokens)

                #maxlen_words = np.minimum(Params['Algorithm'][1]['max_sequence'], maxlen_words)
                self.maxlen = maxlen_words

                # print('Pad sequences (samples x time)')
                #X = [sequence.pad_sequences(X, maxlen=self.maxlen)]
                # x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

                # print('Pad sequences (samples x time)')
                if len(XX) > 0:
                    X = [sequence.pad_sequences(X, maxlen=self.maxlen), XX]
                    X_columns = [self.sequence_vocabulary, XX_columns]
                else:
                    X = [sequence.pad_sequences(X, maxlen=self.maxlen)]
                    X_columns = [self.sequence_vocabulary]

            else:

                max_sequence = np.minimum(
                    max_sequence, Params['Algorithm'][1]['max_seq_length'])
                X = [x[0:np.minimum(max_sequence, len(x))] for x in X]

                # get all words that appeared at least in two articles
                transformer = CountVectorizer(tokenizer=lambda x: x,
                                              preprocessor=lambda x: x,
                                              max_df=1.0,
                                              min_df=2,
                                              max_features=20000,
                                              ngram_range=(1, 1))
                transformer.fit(X)
                self.sequence_vocabulary = {
                    key: (val + 2)
                    for key, val in transformer.vocabulary_.items()
                }  # additional tokens for empty and unknown word

                assert 'UNKNOWN_WORD' not in self.sequence_vocabulary
                assert 'PADDED_WORD' not in self.sequence_vocabulary

                self.sequence_vocabulary['PADDED_WORD'] = 0
                self.sequence_vocabulary['RARE_WORD'] = 1

                # compute mean and mean norm of all word vectors
                sumvec = 0.0
                sumnorm = 0.0
                for k, word in enumerate(
                        self.external_features.word_embedding):
                    vec = self.external_features.word_embedding[word]
                    sumvec += vec
                    sumnorm += np.linalg.norm(vec)
                    if k > 10000:
                        break
                vec_mean = 0 * sumvec / (k + 1)
                vec_norm = sumnorm / (k + 1)
                EMBEDDING_DIM = len(vec_mean)

                def get_random_vec():
                    # generate a random vector with the same mean and norm as the embeddings on average
                    a = 2 * np.random.rand(EMBEDDING_DIM) - 1
                    #a = a + vec_mean
                    a = (a / np.linalg.norm(a)) * vec_norm
                    return a

                word_embedding = {}
                word_embedding['RARE_WORD'] = get_random_vec()
                word_embedding['PADDED_WORD'] = 0

                X = {
                    'FLAT': [],
                    'SENTENCES': []
                }  # index matrix with splitted sentences

                maxlen_doc = 0
                maxlen_sent = 0
                maxlen_words = 0
                unknown_words = set()
                total_words = [0, 0]

                # convert tokens to indices, keep sentences
                for k1, text in enumerate(data_x[self.text_types[0] +
                                                 '_SENTENCES']):
                    X['SENTENCES'].append([])
                    words = 0
                    maxlen_doc = np.maximum(maxlen_doc, len(text))
                    for k2, sent in enumerate(text):
                        X['SENTENCES'][-1].append([])
                        words += len(sent)
                        maxlen_words = np.maximum(maxlen_words, words)
                        maxlen_sent = np.maximum(maxlen_sent, len(sent))
                        if len(sent) == maxlen_sent:
                            maxlen_sent_example = sent
                        for k3, word in enumerate(sent):
                            lemma_word = data_x['LEMMA_SENTENCES'][k1][k2][k3]
                            lemma_word = lemma_word.replace('#', '')

                            total_words[0] += 1
                            # three cases: (1) in list and dictionary (2) in dictionary (3) nowhere
                            vec = None
                            if word in self.external_features.word_embedding:
                                # word has embeddings
                                vec = self.external_features.word_embedding[
                                    word]
                            elif lemma_word in self.external_features.word_embedding:
                                # lemma has embedding, use that instead
                                vec = self.external_features.word_embedding[
                                    lemma_word]
                                word_embedding[word] = vec
                            if word in self.sequence_vocabulary:
                                # word must have embedding, even a random one
                                if vec is None:
                                    vec = get_random_vec()  # null vector
                                    word_embedding[word] = vec
                            else:
                                if vec is None:  # word not in vocabulary and no embedding, mark as unknown
                                    word = 'RARE_WORD'
                                    total_words[1] += 1
                                    unknown_words.add(word)
                                else:  # word not in vocabulary but has embedding,
                                    self.sequence_vocabulary[word] = len(
                                        self.sequence_vocabulary)

                            word_index = Params['sequence_vocabulary'][word]
                            X['SENTENCES'][-1][-1].append(word_index)

                    X['FLAT'].append(
                        list(itertools.chain.from_iterable(
                            X['SENTENCES'][-1])))

                X['FLAT'] = sequence.pad_sequences(X['FLAT'],
                                                   maxlen=maxlen_words)

                assert (total_words[1] / total_words[0]
                        ) < 0.10, 'over 10% of words (tokens) are unknown!'

                vals = sorted([
                    self.sequence_vocabulary[key]
                    for key in self.sequence_vocabulary
                ])

                assert np.max(vals) + 1 == len(vals)

                self.maxlen_words = maxlen_words
                self.maxlen_doc = maxlen_doc
                self.maxlen_sent = maxlen_sent
                self.max_unique_words = len(self.sequence_vocabulary)

                # vals = sorted([self.transformer.vocabulary_[key] for key in self.transformer.vocabulary_.keys()])
                W = np.zeros((self.max_unique_words, EMBEDDING_DIM),
                             dtype=np.float32)
                W.fill(np.nan)

                ind2word = [
                    '' for x in range(0, len(self.sequence_vocabulary))
                ]
                for word in self.sequence_vocabulary.keys():
                    W[self.sequence_vocabulary[word]] = word_embedding[word]
                    ind2word[self.sequence_vocabulary[word]] = word

                #for k,word in Params['sequence_vocabulary']

                #
                if 0:
                    data_x_check = []
                    for k1 in range(0, len(X['FLAT'])):
                        data_x_check.append([])
                        for k2 in range(0, len(X['FLAT'][k1])):
                            data_x_check[k1].append(
                                ind2word[X['FLAT'][k1][k2]])

                Params['W_embedding_matrix'] = W
                Params['max_document_sentences'] = maxlen_doc
                Params['max_sentence_words'] = maxlen_sent
                Params['max_words_in_doc'] = maxlen_words
                Params['max_unique_words'] = self.max_unique_words
                self.word_embedding = word_embedding

                if len(XX) > 0:
                    X = [X, XX]
                    X_columns = 'sequence data (up to %i words) + metadata (%i items)' % (
                        maxlen_words, XX.shape[1])
                else:
                    X = [
                        X,
                    ]
                    X_columns = 'sequence data (up to %i words)' % maxlen_words

        elif self.analysis_type == 'BOW':

            # feature selection type, only for BOW algorithms (not including fasttext)
            self.featureselection = Params['FeatureSelection']

            if not isinstance(self.text_types, list) and not isinstance(
                    self.text_types, tuple):
                self.text_types = [self.text_types]

            if print_info:
                print('\nBuilding and transforming features (training phase)')

            X = []
            X_columns = []
            for feature in self.feature_type:
                for text_type in self.text_types:
                    if feature == 'TFIDF':
                        if text_type == 'POS':
                            ngram_range = self.POS_ngram
                        else:
                            ngram_range = Params['TFIDF_ngram']
                        if print_info:
                            start_time = time.time()
                            print('... adding TF-IDF (%s, ngram=%s)' %
                                  (text_type, str(ngram_range)),
                                  end='')
                        self.transformers.append((TfidfVectorizer(
                            tokenizer=lambda x: x,
                            preprocessor=lambda x: x,
                            max_df=1.0,
                            min_df=2,
                            use_idf=True,
                            max_features=Params['pass1_features'],
                            ngram_range=ngram_range), text_type))
                        x = self.transformers[-1][0].fit_transform(
                            data_x[text_type]).todense()
                        X.append(x)
                        x = self.transformers[-1][0].get_feature_names()
                        x = [
                            'term=' + y + ',type=%s+TFIDF' % text_type
                            for y in x
                        ]
                        X_columns.append(x)
                        if print_info:
                            end_time = time.time()
                            print(' ... done (%1.1fs)' %
                                  (end_time - start_time))
                    elif feature == 'BOW':
                        if text_type == 'POS':
                            ngram_range = self.POS_ngram
                        else:
                            ngram_range = Params['BOW_ngram']
                        if print_info:
                            start_time = time.time()
                            print('... adding BOW (%s, ngram=%s)' %
                                  (text_type, str(ngram_range)),
                                  end='')
                        self.transformers.append((CountVectorizer(
                            tokenizer=lambda x: x,
                            preprocessor=lambda x: x,
                            max_df=1.0,
                            min_df=2,
                            max_features=Params['pass1_features'],
                            ngram_range=ngram_range,
                            dtype=np.float32), text_type))
                        x = self.transformers[-1][0].fit_transform(
                            data_x[text_type]).todense()
                        X.append(x)
                        x = self.transformers[-1][0].get_feature_names()
                        x = [
                            'term=' + y + ',type=%s+BOW' % text_type for y in x
                        ]
                        X_columns.append(x)
                        if print_info:
                            end_time = time.time()
                            print(' ... done (%1.1fs)' %
                                  (end_time - start_time))
                    else:
                        pass

            # do feature selection for individual BOW features or all of them
            is_selected = False
            if len(
                    X
            ) > 0 and self.featureselection != None and self.featureselection[
                    1] != 'global':
                is_selected = True
                if print_info:
                    start_time = time.time()
                    print('... doing feature selection (type=%s)' %
                          str(self.featureselection),
                          end='')
                self.selected_columns = []
                if self.featureselection[1] == 'single':
                    for k, x in enumerate(X):
                        self.selected_columns.append(
                            self.column_selector(
                                x, data_y.copy(),
                                Params['FeatureSelection'][0],
                                Params['FeatureSelection'][2]))
                        X[k] = np.take(x,
                                       indices=self.selected_columns[-1],
                                       axis=1)
                        X_columns[k] = [
                            X_columns[k][kk]
                            for kk in self.selected_columns[-1]
                        ]
                elif self.featureselection[1] == 'all':
                    X = np.concatenate(tuple(X), axis=1)
                    self.selected_columns = self.column_selector(
                        X, data_y.copy(), Params['FeatureSelection'][0],
                        Params['FeatureSelection'][2])
                    X = [np.take(X, indices=self.selected_columns, axis=1)]
                    X_columns = list(itertools.chain.from_iterable(X_columns))
                    X_columns = [
                        list([X_columns[kk] for kk in self.selected_columns])
                    ]
                else:
                    raise (Exception(
                        'featureselection property must be single or all!'))
                if print_info:
                    end_time = time.time()
                    print(' ... done (%1.1fs)' % (end_time - start_time))

            # tf-ifd weighted document embedding
            if 'EMBEDDING' in self.feature_type:
                if print_info:
                    start_time = time.time()
                    print(
                        '... adding embedded document vectors (dim %i) with tf-idf scaling'
                        % self.external_features.embedding_dim,
                        end='')
                self.embedding_type = Params['EMBEDDING_type']
                if self.embedded_transformer == None:
                    self.embedded_transformer = self.TfidfEmbeddingVectorizer(
                        self.external_features.word_embedding,
                        self.external_features.embedding_dim)
                    if self.embedding_type == 'LEMMA':
                        self.embedded_transformer.fit(
                            replace_hash(data_x[self.embedding_type]))
                    else:
                        self.embedded_transformer.fit(
                            data_x[self.embedding_type])
                if self.embedding_type == 'LEMMA':
                    x = self.embedded_transformer.transform(
                        replace_hash(
                            data_x[self.embedding_type]))  #.astype(np.float32)
                else:
                    x = self.embedded_transformer.transform(
                        data_x[self.embedding_type])  # .astype(np.float32)
                X.append(x)
                X_columns.append([
                    'emb%i_%3.0f' % (self.external_features.embedding_dim, kk)
                    for kk in range(1, self.external_features.embedding_dim +
                                    1)
                ])
                if print_info:
                    end_time = time.time()
                    print(' ... done (%1.1fs)' % (end_time - start_time))

            if 'CUSTOM' in self.feature_type:
                X.append(X_custom)
                X_columns.append(X_custom_columns)

            if 'TAGS' in self.feature_type:
                X.append(X_tags)
                X_columns.append(X_tags_columns)

            X = np.concatenate(tuple(X), axis=1)

            X_columns = list(itertools.chain.from_iterable(X_columns))

            # do global feature selection
            if self.featureselection != None and self.featureselection[
                    1] == 'global':
                assert is_selected == False, 'Trying selection twice!'
                if print_info:
                    start_time = time.time()
                    print('... doing feature selection (type=%s)' %
                          str(self.featureselection),
                          end='')
                self.selected_columns = self.column_selector(
                    X, data_y.copy(), Params['FeatureSelection'][0],
                    Params['FeatureSelection'][2])
                X = np.take(X, indices=self.selected_columns, axis=1)
                X_columns = list(
                    [X_columns[kk] for kk in self.selected_columns])

                if print_info:
                    end_time = time.time()
                    print(' ... done (%1.1fs)' % (end_time - start_time))

            assert X.shape[1] == len(
                X_columns), 'X and X_labels have different size! BUG!'

            self.final_data_scaler = get_scaler(self.FEATURE_SCALER)

            if self.FEATURE_SCALER != 'StandardScaler':
                temp_scaler = get_scaler('StandardScaler')
                temp_scaler.fit(X)
                self.feature_scale_multiplier = temp_scaler.scale_
                X = self.final_data_scaler.fit_transform(X)
                self.feature_scale_multiplier = self.feature_scale_multiplier / self.final_data_scaler.scale_
            else:
                self.feature_scale_multiplier = np.ones(X.shape[1])
                X = self.final_data_scaler.fit_transform(X)

            if Params['Algorithm'][0] == 'StringKernel':
                self.analysis_type = 'BOW_StringKernel'
                self.ngramMaxLength = Params['Algorithm'][1]['ngram']
                X_stringkernel = get_stringkernel(stringkernels,
                                                  self.text_types, text_IDs,
                                                  self.ngramMaxLength)
                X_columns = [
                    'String kernels for %s' % " ".join(self.text_types)
                ]
                self.kerneldata_Y = X
                self.kernelfunction = get_kernel(
                    Params['Algorithm'][1]['kerneltype'])
                self.stringkernel_ratio = Params['Algorithm'][1][
                    'stringkernel_ratio']
                X = (self.stringkernel_ratio * X_stringkernel
                     ) + (1.0 - self.stringkernel_ratio) * self.kernelfunction(
                         X=X, Y=None)

            self.final_features = X.shape[1]

        else:
            raise (Exception(
                'Unknown analysis type (should be sequence or classical)'))

        return X, X_columns, Params
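

# Hedged usage sketch (not part of the original class): the expected call order is main() on
# the training data followed by transform() on the held-out data. The Params keys shown are
# assumptions inferred from how they are read above, and my_embeddings is a hypothetical
# external-features object.
# fe = FeatureExtractor(external_features=my_embeddings)
# Params = {'Algorithm': ('BOW', {}), 'TextTypes': ['LEMMA'], 'FeatureMethod': ['TFIDF'],
#           'FeatureSelection': None, 'TFIDF_ngram': (1, 2), 'pass1_features': 20000}
# X_train, X_columns, Params = fe.main(train_x, train_y, Params)
# X_test = fe.transform(test_x)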
class Reader_APNEWS:
    """
    This class is responsible for preprocessing the APNEWS data and for creating training batches.
    The input is always a list containing all documents.
    """
    def __init__(self,
                 datapath,
                 n_features=100000,
                 lm_minimum_freq=5,
                 train_perc=0.6,
                 valid_perc=0.2,
                 language="english",
                 length_batch=10,
                 batch_size=5,
                 sample_size=10000):
        #data preprocessing
        #todo: remove limiting number of samples
        random.seed(1)

        self.language = language
        self.lm_minimum_freq = lm_minimum_freq
        self.train_perc = train_perc
        self.valid_perc = valid_perc
        self.length_batch = length_batch
        self.batch_size = batch_size
        data = self.get_data(datapath)[:sample_size]
        print("len data:", len(data))

        # print("len data", len(data))
        # print(data[:2])
        self.data_samples = self.preprocessing_general(self.shuffle(data))

        # print(self.data_samples[:2])
        self.data_tm = self.preprocessing_tm(self.data_samples)

        #use for ntm model
        self.data_prepped = [
            self.process_doc(doc, i) for i, doc in enumerate(self.data_samples)
        ]

        self.tf_vectorizer = CountVectorizer(max_df=0.95,
                                             min_df=10,
                                             max_features=n_features,
                                             stop_words=self.language)

        #first fit the matrix on the train set
        self.tf_vectorizer.fit_transform(
            self.data_tm[:int(len(self.data_tm) * train_perc)])
        self.tf = self.reluDerivative(
            self.tf_vectorizer.transform(self.data_tm))

        self.idx2word = self.tf_vectorizer.get_feature_names()
        self.vocab_size = np.shape(self.tf)[1]
        print("vocab size", self.vocab_size)

        #LM data
        self.train, self.valid, self.test, self.lm_id2word, self.lm_word2id, self.lm_vocab_size = self.preprocessing_lm(
            data=self.data_samples, minimum_tf=lm_minimum_freq)

    def get_data(self, datapath):
        with open(datapath) as f:
            content = f.readlines()
        # you may also want to remove whitespace characters like `\n` at the end of each line
        content = [x.strip() for x in content]
        return content

    def shuffle(self, x):
        x_new = [[doc] for doc in x]
        random.shuffle(x_new)
        return [x[0] for x in x_new]

    # takes data in the form of list of strings
    def preprocessing_lm(self, data, minimum_tf):
        # gets tf from corpus
        def get_tf(d):
            tf = defaultdict(int)
            for doc in d:
                for sen in doc:
                    for word in sen:
                        tf[word] += 1
            return tf

        def create_vocab(data):
            idx2word = []
            word2idx = dict()
            for doc in data:
                for sen in doc:
                    for word in sen:
                        if word not in word2idx:
                            word2idx[word] = len(idx2word)
                            idx2word.append(word)
            word2idx["<EOS>"] = len(idx2word)
            idx2word.append("<EOS>")
            word2idx["<BOS>"] = len(idx2word)
            idx2word.append("<BOS>")
            word2idx["<PAD>"] = len(idx2word)
            idx2word.append("<PAD>")
            return idx2word, word2idx

        def remove_numbers(data):
            return [[[
                word if not word.isdigit() else "<NUMBER>" for word in sen
            ] for sen in doc] for doc in data]

        # removes rare words
        def remove_rare_words(data, tf, min_freq):
            return [[[
                word if tf[word] >= min_freq else "<UNK>" for word in sen
            ] for sen in doc] for doc in data]

        def create_language_model_data(data, word2idx):
            lm_data = []
            for doc in data:
                if doc == []:
                    lm_data.append(None)
                    continue
                doc_new = [copy.deepcopy(sen) for sen in doc]
                doc_new[0].insert(0, word2idx["<EOS>"])

                for sen in doc_new:
                    sen.append(word2idx["<EOS>"])
                lm_data.append(doc_new)

                # print( lm_data)
            lm_data = [
                list(itertools.chain.from_iterable(doc))
                if doc != None else None for doc in lm_data
            ]
            return lm_data

        def get_batch_data(data):
            def create_batches(d, batch_size=1, lstm_length=20):
                batches = len(d) // (lstm_length * batch_size)
                if batches == 0:
                    # print( "peep peep")
                    return None
                cutoff = batches * lstm_length * batch_size
                d = np.array(d[:cutoff])
                # for larger batch size
                d = d.reshape((batch_size, batches * lstm_length))
                # horizontal split
                output = np.hsplit(
                    d, [i * lstm_length for i in range(1, batches)])
                # output = d.reshape(-1, 1, lstm_length)
                return output

            x = copy.deepcopy(data[:-1])
            y = copy.deepcopy(data[1:])
            x_batch = create_batches(x,
                                     batch_size=self.batch_size,
                                     lstm_length=self.length_batch)
            y_batch = create_batches(y,
                                     batch_size=self.batch_size,
                                     lstm_length=self.length_batch)
            if x_batch == None:
                return None

            return [(x_batch[i], y_batch[i]) for i in range(len(x_batch))]

        data_listform = [[
            word_tokenize(y, language=self.language)
            for y in sent_tokenize(x, language=self.language)
        ] for x in data]
        #get tf for train set

        # with open('coherence_data/apnews/corpus.0', 'w') as f:
        #     for doc in data_listform:
        #         doc = " ".join([item for sublist in doc for item in sublist])
        #         f.write(doc + "\n")

        tf_train = get_tf(
            data_listform[:int(len(data_listform) * self.train_perc)])
        data_listform = remove_numbers(data_listform)
        data_listform = remove_rare_words(data_listform,
                                          tf_train,
                                          min_freq=self.lm_minimum_freq)

        # statistic purposes
        sp = [len(x) for x in data_listform]
        print("min number of words in a document:", min(sp))
        print("max number of words in a document:", max(sp))
        print("average number of words:", sum(sp) / len(sp))

        idx2word, word2idx = create_vocab(data_listform)

        tokenized_data = [[[word2idx[word] for word in sen] for sen in doc]
                          for doc in data_listform]

        language_model_data = create_language_model_data(
            tokenized_data, word2idx)

        new_tf = copy.deepcopy(self.tf)
        new_data_set = [
            {
                "doc_tm": x,
                "doc_tm_sparse": np.where(x > 0)[0],
                "doc_lm": get_batch_data(language_model_data[i])
            } for i, x in enumerate(new_tf)
            if len(np.where(x > 0)[0]) > 0 and language_model_data[i] != None
            and get_batch_data(language_model_data[i]) != None
        ]
        total_length = len(new_data_set)
        train_idx = int(total_length * self.train_perc)
        valid_idx = int(total_length * (self.train_perc + self.valid_perc))
        train = new_data_set[:train_idx]
        valid = new_data_set[train_idx:valid_idx]
        test = new_data_set[valid_idx:]

        return train, valid, test, idx2word, word2idx, len(idx2word)

    def get_sets(self, valid_perc=0.2):
        new_tf = copy.deepcopy(self.tf)
        # here we add the indices of the nonzero terms and remove documents that contain no words from the vocab
        # the third variable is the text
        new_data_set = [{
            "doc_tm": x,
            "doc_tm_1": np.where(x > 0)[0],
            "doc_lm": self.language_model_data[i]
        } for i, x in enumerate(new_tf) if len(np.where(
            x > 0)[0]) > 0 and self.language_model_data[i] != None]
        total_length = len(new_data_set)
        train_idx = int(total_length * self.train_perc)
        valid_idx = int(total_length * (self.train_perc + valid_perc))

        train = new_data_set[:train_idx]
        valid = new_data_set[train_idx:valid_idx]
        test = new_data_set[valid_idx:]
        return train, valid, test

    # lowercases the text; optionally removes numbers, stems or lemmatizes
    def preprocessing_general(self,
                              data,
                              remove_the_uppercase=True,
                              remove_the_numbers=False,
                              stem=False,
                              lemmatize=False):
        def remove_uppercase(data):
            new_data = []
            for x in data:
                new_data.append(x.lower())
            return new_data

        def remove_numbers(d):
            new_data = [[
                word_tokenize(y, language=self.language)
                for y in sent_tokenize(x, language=self.language)
            ] for x in d]
            data_no_digits = [[[
                word if not word.isdigit() else "<NUMBER>" for word in sen
            ] for sen in doc] for doc in new_data]

            return [
                " ".join([" ".join([word for word in s]) for s in doc])
                for doc in data_no_digits
            ]

        new_data = data
        if remove_the_uppercase:
            print("replacing uppercase by lowercase")
            new_data = remove_uppercase(new_data)

        if remove_the_numbers:
            print("removing numbers from general data")
            new_data = remove_numbers(new_data)

        return new_data
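    # Hedged usage sketch (not in the original source), assuming `prep` is an
    # instance of this class with self.language == "english":
    #   prep.preprocessing_general(["The 3 Bills"], remove_the_numbers=True)
    #   # -> ["the <NUMBER> bills"]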

    def preprocessing_tm(self, data):
        return data

    def process_doc(self, doc, i):
        """"this function preprocesses the documents
        """

        sentences = sent_tokenize(doc)
        output_data = [word_tokenize(s) for s in sentences]
        return output_data

    def reluDerivative(self, input):
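        # Derivative of ReLU: densify the sparse input and return a 0/1 mask
        # marking which entries are strictly positive.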
        x = input.toarray()
        x[x <= 0] = 0
        x[x > 0] = 1
        return x
    'Seed1-Napier', 'Seed2-Devon', 'Seed3-Richmond', 'Seed4-Bessborough'
]
seed['YEAR'] = [1884, 1845, 1882, 1881]
seed = seed[['BILL', 'YEAR', 'SPEECH_ACT']]
# append to end of text df
text = pd.concat([text, seed]).reset_index(drop=True)

# now that the raw data has been processed, we build up the dictionary
# prepare the corpus
corpus = list(text['SPEECH_ACT'])
nr_docs = 10e0**np.linspace(0, 7, num=8)
max_df = (nr_docs + 0.5) / len(corpus)
# get unique words, remove special chars, spellcheck, lemma/stem
for i in range(len(nr_docs)):
    vectorizer = CountVectorizer(max_df=max_df[i])
    vec = vectorizer.fit_transform(corpus)
    words = vectorizer.get_feature_names()
    # remove words with special characters and numbers in them
    words_nonr = [word for word in words if word.isalpha()]
    # correctly and incorrectly spelled english words
    words_en = [word for word in words_nonr if dictionary.check(word)]
    words_nonen = [word for word in words_nonr if not dictionary.check(word)]
    # lemmatize
    # orig_lemmas = [word for word in words_en if lemmatizer.lemmatize(word) is not None]
    # lemmas = [lemmatizer.lemmatize(word) for word in words_en]
    # stem
    orig_stems = [word for word in words_en if stemmer.stem(word) is not None]
    stems = [stemmer.stem(word) for word in words_en]
# create dictionary from lists
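# Hedged sketch (not part of the original snippet): one plausible way to build
# that dictionary, mapping each stem back to the original words that reduce to
# it. The name `stem_dict` is hypothetical.
stem_dict = {}
for original, stem in zip(orig_stems, stems):
    stem_dict.setdefault(stem, set()).add(original)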
Example #59
0
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
pd.options.mode.chained_assignment = None
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score, train_test_split

df = pd.read_csv("../prediction_app/static/merged_data.csv")
essay_df = df[['_projectid', 'RESP', ' essay']]
# keep only rows whose essay field is a string (drops NaN/float entries)
essay_df['new_essay'] = essay_df[' essay'].map(lambda x: type(x))
essay_df = essay_df[essay_df.new_essay == str]
print "done throwing out floats"
print "percent remaining", len(essay_df) / len(df)
essay_df.new_essay = essay_df[' essay'].map(lambda x: x.decode('utf-8'))
print "done decoding"

documents = essay_df.new_essay.tolist()
classes = essay_df.RESP.tolist()

vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2))
doc_vectors = vectorizer.fit_transform(documents)
print "done vectorizing" \
      ""
model = MultinomialNB().fit(doc_vectors, classes)
print "done fitting model"

precision = np.mean(
    cross_val_score(model, doc_vectors, classes, scoring='precision'))
cm = confusion_matrix(classes, model.predict(doc_vectors))
print "Precision", precision
print "Percentage off", cm[0][1] / (cm[0][0] + cm[0][1])
print cm
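# Hedged sketch (not part of the original example): train_test_split is
# imported above but never used; evaluating on a held-out split rather than
# on the training data might look like this. All names below are hypothetical.
docs_train, docs_test, resp_train, resp_test = train_test_split(
    documents, classes, test_size=0.2, random_state=0)
heldout_vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2))
heldout_model = MultinomialNB().fit(
    heldout_vectorizer.fit_transform(docs_train), resp_train)
print "held-out confusion matrix"
print confusion_matrix(
    resp_test, heldout_model.predict(heldout_vectorizer.transform(docs_test)))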
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# `dataset` is assumed to be a pandas DataFrame loaded earlier in the original
# script, with the review text in its 'Review' column and the label in its
# second column.
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
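
# Hedged sketch (not in the original snippet): evaluating the test-set
# predictions with a confusion matrix and overall accuracy.
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))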