Example No. 1
def vectorize_test(texts, vocab=None):
  """Vectorize texts, optionally against a fixed vocabulary."""
  if vocab:
    vectorizer = CountVectorizer(min_df=0, stop_words="english",
                                 vocabulary=vocab)
    features = vectorizer.transform(texts)
  else:
    # Without a fixed vocabulary the vectorizer must be fitted first.
    vectorizer = CountVectorizer(min_df=0, stop_words="english")
    features = vectorizer.fit_transform(texts)
  return features
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words = self.custom_stop_words)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Query the tree for neighbors of each tweet:
        n_neighbors = []
        neighbors_indices = []
        working_batch_size = len(tweets)
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0:
                print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), working_batch_size)
            # Only search neighbors for tweets with more than 2 distinct tokens.
            if x.getnnz() > 2:
                neighbors = tree2.radius_neighbors(x, radius=self.sensitivity)[1]
                n_neighbors.append(len(neighbors[0]))
                neighbors_indices.append(neighbors)
            else:
                n_neighbors.append(1)
                neighbors_indices.append(np.array([np.array([0])]))

        neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]

        return neighbors_indices
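A minimal usage sketch for the method above; the surrounding class is not shown in this example, so SpamDetector, custom_stop_words and sensitivity are hypothetical names standing in for whatever the original class defines.

# Hypothetical wrapper: SpamDetector is assumed to expose custom_stop_words,
# sensitivity and the single_batch method shown above.
detector = SpamDetector(custom_stop_words="english", sensitivity=0.4)
batch = ["win a free phone today click here",
         "win a free phone today click here now",
         "had a quiet lunch with friends"]
spam_indices = detector.single_batch(batch)
print spam_indices  # indices into `batch` whose neighborhoods are crowded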
Example No. 3
def handle_doc(word_set,rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path+docs)
        print "start to handle the -->  "+docs
        for file_d in files:
            d_path = rs_path+docs+'/'+file_d
            #get the single file path
            with open(d_path,'rb') as text_file:
                str_tmp = ''
                file_lines = text_file.readlines()
                for line in file_lines:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(line,pattern)
                    for t in tokens:
                        if t.lower() in word_set:
                            str_tmp += t.lower()
                            str_tmp += ' '
                doc_matrix.append(str_tmp)
                doc_cat.append(cat_dic[docs])
    str_tmp = ''
    for sw in word_set:
        str_tmp += sw
        str_tmp += ' '
    doc_matrix.append(str_tmp)
    doc_cat.append('NAN')
    vectorizer = CountVectorizer()
    doc_num = vectorizer.fit_transform(doc_matrix)
    tfidf = TfidfTransformer()
    doc_tfidf = tfidf.fit_transform(doc_num)
    return doc_tfidf[:-1,:],doc_cat[:-1]
Example No. 4
def dtm_matrix(lessonpath):
    # lesson number
    lessonname = lessonpath.split("/")[-2]

    # creating corpus of txt files
    corpusText(lessonpath)

    # finding the paths of the text files
    corpuspath = "C:/Users/eabalo/Desktop/STAAR35Analyses/data/corpus"
    filepaths = glob.glob(corpuspath + "/" + lessonname + "/*.txt")

    # script names
    docindex = [w.split("-")[-1].split(".")[0] for w in filepaths]

    # building a document-term matrix
    vectorizer = CountVectorizer(input="filename")

    dtm = vectorizer.fit_transform(filepaths)

    # lexicon of words in lesson
    # vocab = vectorizer.get_feature_names()

    # converting to numpy arrays
    dtm = dtm.toarray()

    # vocab = np.array(vocab)

    return dtm, docindex, lessonname
Example No. 5
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = (
        "\xd0\x9c\xd0\xb0\xd1\x88\xd0\xb8\xd0\xbd\xd0\xbd\xd0\xbe\xd0"
        "\xb5 \xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb5\xd0\xbd\xd0\xb8\xd0"
        "\xb5 \xe2\x80\x94 \xd0\xbe\xd0\xb1\xd1\x88\xd0\xb8\xd1\x80\xd0\xbd"
        "\xd1\x8b\xd0\xb9 \xd0\xbf\xd0\xbe\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb7"
        "\xd0\xb4\xd0\xb5\xd0\xbb \xd0\xb8\xd1\x81\xd0\xba\xd1\x83\xd1\x81"
        "\xd1\x81\xd1\x82\xd0\xb2\xd0\xb5\xd0\xbd\xd0\xbd\xd0\xbe\xd0\xb3"
        "\xd0\xbe \xd0\xb8\xd0\xbd\xd1\x82\xd0\xb5\xd0\xbb\xd0\xbb\xd0"
        "\xb5\xd0\xba\xd1\x82\xd0\xb0, \xd0\xb8\xd0\xb7\xd1\x83\xd1\x87"
        "\xd0\xb0\xd1\x8e\xd1\x89\xd0\xb8\xd0\xb9 \xd0\xbc\xd0\xb5\xd1\x82"
        "\xd0\xbe\xd0\xb4\xd1\x8b \xd0\xbf\xd0\xbe\xd1\x81\xd1\x82\xd1\x80"
        "\xd0\xbe\xd0\xb5\xd0\xbd\xd0\xb8\xd1\x8f \xd0\xb0\xd0\xbb\xd0\xb3"
        "\xd0\xbe\xd1\x80\xd0\xb8\xd1\x82\xd0\xbc\xd0\xbe\xd0\xb2, \xd1\x81"
        "\xd0\xbf\xd0\xbe\xd1\x81\xd0\xbe\xd0\xb1\xd0\xbd\xd1\x8b\xd1\x85 "
        "\xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb0\xd1\x82\xd1\x8c\xd1\x81\xd1"
        "\x8f.")

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert_equal(X_counted.shape, (1, 15))

    vect = HashingVectorizer(norm=None, non_negative=True)
    X_hashed = vect.transform([document])
    assert_equal(X_hashed.shape, (1, 2 ** 20))

    # No collisions on such a small dataset
    assert_equal(X_counted.nnz, X_hashed.nnz)

    # When norm is None and non_negative, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))
Example No. 6
def vocab_size(texts, min_count=[1,2,3,4,5], visualise=False, save=False):
    """Plots vocabulary size as a function of the minimum document
    frequency (min_df) passed to CountVectorizer.

    Args
    ----
    texts: list of Strings
        List of all the texts
    min_count: list of ints
        Candidate values for min_df
    visualise: bool
        If True, plot vocabulary size against min_count
    save: bool
        If True, save the plot as "Count_vs_vocabSize.png"

    Returns
    -------
    sizes: List of ints
        Vocabulary size for each value in min_count
    """
    sizes = []
    for i in min_count:
        CV = CountVectorizer(min_df = i)
        BoWs = CV.fit_transform(texts)
        sizes.append(BoWs.shape[1])

    if visualise:
        plt.clf()
        plt.plot(min_count, sizes, 'bo-')
        if save:
            plt.savefig("Count_vs_vocabSize.png")

    return sizes
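A short usage sketch for vocab_size, assuming CountVectorizer and matplotlib.pyplot are imported as in the function above.

corpus = ["the cat sat on the mat",
          "the dog sat on the log",
          "the cats and dogs sat together"]
# The vocabulary shrinks as min_df rises, because infrequent terms are dropped.
print(vocab_size(corpus, min_count=[1, 2, 3]))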
Example No. 7
def find_common_words(all_words, num_most_frequent_words):
    vectorizer = CountVectorizer(
        stop_words=None, # 'english',
        max_features=num_most_frequent_words,
        binary=True)
    vectorizer.fit(all_words)
    return (vectorizer.vocabulary_, vectorizer.get_feature_names())
def do_vectorize(filenames, tokenizer_fn=tokenize, min_df=1,
                 max_df=1., binary=True, ngram_range=(1,1)):
    """
    Convert a list of filenames into a sparse csr_matrix, where
    each row is a file and each column represents a unique word.
    Use sklearn's CountVectorizer: http://goo.gl/eJ2PJ5
    Params:
        filenames.......list of review file names
        tokenizer_fn....the function used to tokenize each document
        min_df..........remove terms from the vocabulary that don't appear
                        in at least this many documents
        max_df..........remove terms from the vocabulary that appear in more
                        than this fraction of documents
        binary..........If true, each document is represented by a binary
                        vector, where 1 means a term occurs at least once in
                        the document. If false, the term frequency is used instead.
        ngram_range.....A tuple (n,m) means to use phrases of length n to m inclusive.
                        E.g., (1,2) means consider unigrams and bigrams.
    Return:
        A tuple (X, vec), where X is the csr_matrix of feature vectors,
        and vec is the CountVectorizer object.
    """
    vectorizer = CountVectorizer(tokenizer=tokenizer_fn, min_df=min_df, max_df=max_df, binary=binary, ngram_range=ngram_range, dtype=int)
    X = vectorizer.fit_transform(filenames)
    return (X, vectorizer)
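A usage sketch for do_vectorize; the file paths below are hypothetical, and the tokenize helper referenced by the default tokenizer_fn is assumed to read and tokenize a review file, as the docstring implies.

filenames = ["reviews/pos_001.txt", "reviews/neg_001.txt"]  # hypothetical paths
X, vec = do_vectorize(filenames, min_df=1, ngram_range=(1, 2))
print(X.shape)               # (number of files, number of unique terms)
print(len(vec.vocabulary_))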
Example No. 9
def BoW(texts, vectorizerType="count", min_df=3):
    """Takes a list of texts and creates a BoWs object

    Args
    ----
    texts: List of Strings
        all the texts
    vectorizerType: String
        One of "count" or "tfidf"
    min_df: int
        Minimum number of documents a term must appear in

    Returns
    -------
    CV: Vectorizer object
        One of CountVectorizer or TfidfVectorizer
    BoWs: sparse matrix
        The fitted bag-of-words matrix
    """

    if vectorizerType == "count":
        CV = CountVectorizer(min_df=min_df)
    elif vectorizerType == "tfidf":
        CV = TfidfVectorizer(min_df=min_df)
    else:
        raise ValueError("vectorizerType must be 'count' or 'tfidf'")

    BoWs = CV.fit_transform(texts)

    return CV, BoWs
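A short usage sketch of the BoW helper above.

docs = ["spam spam eggs", "eggs bacon spam", "bacon eggs toast"]
CV, BoWs = BoW(docs, vectorizerType="tfidf", min_df=1)
print(BoWs.shape)              # (3 documents, number of distinct terms)
print(CV.get_feature_names())  # the learned vocabulary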
Example No. 10
def wordMoverDistance(d1, d2):
    # d1, d2: lists of tokens
    # Rule out words that are not in the vocabulary
    d1 = " ".join([w for w in d1 if w in vocab_dict])
    d2 = " ".join([w for w in d2 if w in vocab_dict])
    vect = CountVectorizer().fit([d1, d2])
    feature_names = vect.get_feature_names()
    W_ = W[[vocab_dict[w] for w in feature_names]]  # word embedding matrix
    D_ = euclidean_distances(W_)  # pairwise distance matrix
    D_ = D_.astype(np.double)
    #D_ /= D_.max()  # Normalize for comparison
    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()
    ### EMD
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    #print("d(doc_1, doc_2) = {:.2f}".format(emd(v_1, v_2, D_)))
    emd_d = emd(v_1, v_2, D_) ## WMD
    #print emd_d
    return emd_d
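A usage sketch for wordMoverDistance; it assumes the globals the function relies on are already set up: vocab_dict mapping words to rows of an embedding matrix W, and emd provided by the pyemd package. The two documents are illustrative only.

doc_a = ["obama", "speaks", "media", "illinois"]
doc_b = ["president", "greets", "press", "chicago"]
# Both documents are filtered against vocab_dict before the EMD is computed.
print(wordMoverDistance(doc_a, doc_b))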
def feature_extraction(in_file_name):
    in_file = codecs.open(in_file_name, 'r', 'latin-1')  # pre-processed corpus
    corpus = []
    while True:
      # iterate over the documents
      doc = in_file.readline().strip()
      if doc == '':  # an empty string is returned at end of file
        break
      corpus.append(doc)
    
    # Build the binary, tf and tf-idf feature spaces.
    # Note: the number of retained features can be limited via CountVectorizer's max_features parameter.
    max_features = None
    #max_features = 10000
    bin_vectorizer = CountVectorizer(max_features=max_features, binary=True, min_df=3)  # binary occurrence markers
    print 'calculating term occurrence feature...'
    term_occurence = bin_vectorizer.fit_transform(corpus)  # binary features
    '''
    tf_vectorizer = CountVectorizer(max_features=max_features,min_df=3)
    term_counts = tf_vectorizer.fit_transform(corpus)
    #tf = normalize(term_counts, axis=1, norm='l2')
    print 'calculating tf feature...'
    tf_transformer = TfidfTransformer(norm='l1', use_idf=False)
    tf = tf_transformer.fit_transform(term_counts)  # tf features
    '''
    '''
    print 'calculating tf-idf feature...'
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(term_counts)  # tf-idf features
    '''
    return bin_vectorizer, term_occurence
Example No. 12
def getCount(artName):
    artLst = []
    #artDict = {}
    for fn in os.listdir(indir):
        if not fn.endswith('.xml'): continue
        if ':' in fn:
            fn = fn.replace(':','/')
        fn = fn.decode('utf-8')
        #fn = unicodedata.normalize("NFC",fn)
        fn_de = unidecode(fn)
        newfn = fn_de[:-4]
        #print 'artName: ',artName, 'eval: ', newfn
        newfn = newfn.lower()
        if newfn == artName:
            # print "found article begin processing"
            #print fn
            if '/' in fn:
                fn = fn.replace('/',':')
            fullname = os.path.join(indir, fn)
            tree = ET.parse(fullname)
            root = tree.getroot()
            page = root.find('{http://www.mediawiki.org/xml/export-0.7/}page')

            revisions = page.findall('{http://www.mediawiki.org/xml/export-0.7/}revision')
            for s in revisions:
                txt = s.find('{http://www.mediawiki.org/xml/export-0.7/}text')
                artLst.append(txt.text)
            artLst = filter(None,[one for one in artLst])
            # print "processing done; begin counting"
            vectorizer = CountVectorizer(min_df=1,token_pattern='([^\[\|\]\s\.\!\=\{\}\;\<\>\?\"\'\#\(\)\,\*]+)')
            X = vectorizer.fit_transform(artLst)
            artDict = dict(zip(vectorizer.get_feature_names(),np.asarray(X.sum(axis=0)).ravel()))
        
            return artDict
    return -1
Example No. 13
def get_data(dir):
    titles = []
    titles_label = []
    os.path.walk(dir, visit, [titles, titles_label])
    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
    vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)
    # fit_transform() does two things: first, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of
    # strings.
    titles_vocab_mat = vectorizer.fit_transform(titles)
    # Numpy arrays are easy to work with, so convert the result to an array
    #print vectorizer.vocabulary_  # a dict, the value is the index
    train_data_features = titles_vocab_mat.toarray()
    print train_data_features.shape
    # Take a look at the words in the vocabulary
    vocab = vectorizer.get_feature_names()
    print '/'.join(vocab)
    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)
    total_words = 0
    for i in train_data_features:
        #print sum(i)
        total_words += sum(i)
    print total_words
    weka(vocab, dist, train_data_features, total_words, titles_label)
Example No. 14
def get_feature_by_opcode():
    global white_count
    global black_count
    global max_features
    global webshell_dir
    global whitefile_dir
    print "max_features=%d webshell_dir=%s whitefile_dir=%s" % (max_features,webshell_dir,whitefile_dir)
    x=[]
    y=[]

    webshell_files_list = load_files_opcode_re(webshell_dir)
    y1=[1]*len(webshell_files_list)
    black_count=len(webshell_files_list)

    wp_files_list =load_files_opcode_re(whitefile_dir)
    y2=[0]*len(wp_files_list)

    white_count=len(wp_files_list)


    x=webshell_files_list+wp_files_list
    #print x
    y=y1+y2

    CV = CountVectorizer(ngram_range=(2, 4), decode_error="ignore",max_features=max_features,
                                       token_pattern = r'\b\w+\b',min_df=1, max_df=1.0)

    x=CV.fit_transform(x).toarray()

    return x,y
Example No. 15
def check_webshell(clf,dir):
    all=0
    all_php=0
    webshell=0

    webshell_files_list = load_files_re(webshell_dir)
    CV = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(webshell_files_list).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit_transform(x)


    g = os.walk(dir)
    for path, d, filelist in g:
        for filename in filelist:
            fullpath = os.path.join(path, filename)
            t = load_file(fullpath)
            t_list=[]
            t_list.append(t)
            x2 = CV.transform(t_list).toarray()
            x2 = transformer.transform(x2).toarray()
            y_pred = clf.predict(x2)
            all+=1
            if filename.endswith('.php'):
                all_php+=1
            if y_pred[0] == 1:
                print "%s is a webshell" % fullpath
                webshell+=1

    print "Scanned %d files (%d php files), %d are webshells" % (all, all_php, webshell)
def bayes_tfidf(prefix, sufix, dic_fn):
    """
    prefix example: ./data/single_label_sen/sen_spanish_protest
    sufix example: pop_cat
    """

    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"

    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix
    
    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}


    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    # construct the word count matrix
    count_vect = CountVectorizer()
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    # construct tfidf matrix
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count)
    test_set_x = tfidf_transformer.transform(test_set_count)

    print "start the model"
    test_score = bayes_experiment([train_set_x, train_y], [test_set_x, test_y])
    return test_score
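A usage sketch matching the file-naming convention described in the docstring above; the dictionary path is hypothetical, and the bayes_experiment helper used inside is assumed to be available.

# Expects ./data/single_label_sen/sen_spanish_protest_train.txt.tok,
# ..._test.txt.tok, ..._train.pop_cat and ..._test.pop_cat to exist;
# "category_dict.txt" is a hypothetical label-dictionary file.
score = bayes_tfidf("./data/single_label_sen/sen_spanish_protest",
                    "pop_cat", "category_dict.txt")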
Example No. 17
def get_feature_by_bag_tfidf():
    global white_count
    global black_count
    global max_features
    print "max_features=%d" % max_features
    x=[]
    y=[]

    webshell_files_list = load_files_re(webshell_dir)
    y1=[1]*len(webshell_files_list)
    black_count=len(webshell_files_list)

    wp_files_list =load_files_re(whitefile_dir)
    y2=[0]*len(wp_files_list)

    white_count=len(wp_files_list)


    x=webshell_files_list+wp_files_list
    y=y1+y2

    CV = CountVectorizer(ngram_range=(2, 4), decode_error="ignore",max_features=max_features,
                                       token_pattern = r'\b\w+\b',min_df=1, max_df=1.0)
    x=CV.fit_transform(x).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(x)
    x = x_tfidf.toarray()

    return x,y
def train_vectorizer(corpus, max_features=10000):
    """ Train the vectorizer """
    print "training the vectorizer..."
    vectorizer = CountVectorizer(decode_error='ignore', max_features=max_features)
    vectorizer.fit(corpus)
    print "ok"
    return vectorizer
def work_with_simple_bag_of_words():
    count = CountVectorizer()
    docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet',
    ])
    bag = count.fit_transform(docs)
    print(count.vocabulary_)
    print(bag.toarray())

    np.set_printoptions(precision=2)
    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    print(tfidf.fit_transform(bag).toarray())

    tf_is = 2
    n_docs = 3
    idf_is = np.log((n_docs+1) / (3+1))
    tfidf_is = tf_is * (idf_is + 1)
    print("tf-idf of term 'is' = %.2f" % tfidf_is)

    tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
    raw_tfidf = tfidf.fit_transform(bag).toarray()[-1]
    print(raw_tfidf)

    l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
    print(l2_tfidf)
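The hand calculation of the tf-idf for 'is' above follows scikit-learn's smoothed idf; a small sketch restating the formula with the same numbers ('is' occurs twice in the third document and appears in all three documents).

import numpy as np

tf = 2                                      # occurrences of 'is' in the third document
n_docs, df = 3, 3                           # corpus size and document frequency of 'is'
idf = np.log((1 + n_docs) / (1 + df)) + 1   # smooth_idf adds 1 to both counts, then 1 to the log
print(tf * idf)                             # 2.0, i.e. raw_tfidf of 'is' before l2 normalisation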
Example No. 20
def bag_of_words_to_list(lines,max_features):
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool
    # removing stopwords
    vectorizer = CountVectorizer(
        stop_words = 'english'
        ,max_features = max_features
        )
    
    # TODO: check whether TfidfVectorizer would be a better fit here

    print('>> Removing stopwords...')
    # lets remove stopwords
    lines = remove_stopwords(lines,2)

    print('>> Stemming...')
    # lets stem it
    lines =stemming(lines,3)

    print('>> Doing bag of words...')
    #lets do the bag of words
    bag_of_words = vectorizer.fit_transform(lines)



    #uncomment to visualize the words and how many times are used
    #printing_bow(bag_of_words,vectorizer)

    return(vectorizer.get_feature_names(),bag_of_words.toarray())
Example No. 21
def race_tfidf(data, can_be_noun_arg, stop_words):
    print 
    data = data.groupby('race')['last']
    data = dict(list(data))
    docs = []
    for k in data:
        docs.append(' '.join(data[k]))
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform(docs)
    #print counts.todense().shape
    tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)
    tfidf.fit(counts)
    #print "IDF:", tfidf.idf_.shape
    tf_idf_matrix = tfidf.transform(counts)
    freqs = {}
    sorted_voc = sorted(count_vectorizer.vocabulary_.iteritems(), key=operator.itemgetter(1))
    terms,_ = zip(*sorted_voc)
    for i,k in enumerate(data.keys()):
        # make list
        row = np.array(tf_idf_matrix.todense()[i,:])[0].tolist()
        freq = zip(terms, row)
        freqs[k] = sorted(freq, reverse=True, key=lambda x: x[1])
        print freqs[k][:5]
    #print tf_idf_matrix.todense().shape
    return freqs
Example No. 22
def getFeature():
    with open(os.path.join('spam_filter_train.txt'), 'r') as f:
        trainData = f.readlines()
    with open(os.path.join('spam_filter_test.txt'), 'r') as f:
        testData = f.readlines()
    data = trainData + testData
    trainNum, testNum = len(trainData), len(testData)
    del trainData
    del testData

    for i in range(len(data)):
        data[i] = data[i].replace('\n', '').split('\t')[1]
    # lemmatize
    lemmatized = []
    wnl = WordNetLemmatizer()
    for line in data:
        lemmatized.append([wnl.lemmatize(word) for word in line.split(' ')])
    # remove stopwords
    stopwordRemoved = []
    sw = set(stopwords.words('english'))
    for line in lemmatized:
        stopwordRemoved.append(' '.join([x for x in line if x not in sw]))
    # tf feature
    vec = CountVectorizer()
    features = vec.fit_transform(stopwordRemoved)

    with open('trainFeatures.pkl', 'wb') as f:
        cPickle.dump(features[:trainNum], f)
    with open('testFeatures.pkl', 'wb') as f:
        cPickle.dump(features[trainNum:], f)
Example No. 23
def produceLDATopics():
    '''
    Takes description of each game and uses sklearn's latent dirichlet allocation and count vectorizer
    to extract topics.
    :return: pandas data frame with topic weights for each game (rows) and topic (columns)
    '''
    data_samples, gameNames = create_game_profile_df(game_path)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    topics = lda.fit_transform(tf)
    # for i in range(50):
    #     gameTopics = []
    #     for j in range(len(topics[0])):
    #         if topics[i,j] > 1.0/float(n_topics):
    #             gameTopics.append(j)
    #     print gameNames[i], gameTopics
    topicsByGame = pandas.DataFrame(topics)
    topicsByGame.index = gameNames
    print topicsByGame

    tf_feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([tf_feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

    return topicsByGame
Example No. 24
    def setTestInputforNN(self, collection={}, sel_words=[]):
        list_of_strings = []
        list_of_salary = []
        count = 0
        sel_words_set = set(sel_words)
        sel_words_list = list(sel_words_set)
        for document in collection:
            count += 1
            title = document.getTitle()
            description = document.getDescription()
            salary = (int)(document.getSalaryNorm())
            words = re.split(" ", title) + re.split(" ", description)
            # words = [x for x in words if x in sel_words]
            wordsUnique = set(words)
            wordsUnique = wordsUnique & sel_words_set
            words = [x for x in words if x in wordsUnique]
            documentString = " ".join(words)
            list_of_strings.append(documentString)
            list_of_salary.append(salary)

            if not (count % 15000):
                break

        vectorizer = CountVectorizer(vocabulary=sel_words, min_df=1)
        self.inp = vectorizer.fit_transform(list_of_strings)
        from sklearn.externals import joblib

        joblib.dump(self.inp.tocsr(), "test_dataset_in.joblib")

        self.inp_size = len(list_of_strings)
        output = np.array(list_of_salary)
        self.target = output.reshape(len(list_of_strings), 1)
        joblib.dump(self.target, "test_dataset_out.joblib")

        return [self.inp, self.target]
Example No. 25
def bagOfWord(X):
    vectorizer = CountVectorizer(min_df=8, token_pattern=r"(?u)\b\w+\b")
    X = vectorizer.fit_transform(X)
    with open('./model/vectorizer.pkl','wb') as fr:
        print('save text vectorizer to ./model/')
        pickle.dump(vectorizer,fr)
    return X
Example No. 26
def textExtraction(df, series):
    vectorizer = CountVectorizer(analyzer = text_process, min_df = 0.1)
    df[series] = df[series].replace(np.nan, '', regex=True)
    vectorizer.fit_transform(df[series])
    vocab = vectorizer.get_feature_names()
    
    return vocab
Example No. 27
	def feature_extraction(self):
		vectorizer = CountVectorizer(binary=True, stop_words='english')
		corpus = []
		for doc in self.person.doc_list:
			corpus.append(doc.title + ' ' + doc.snippet)
		self.train = vectorizer.fit_transform(corpus)
		return vectorizer
Example No. 28
def bag_of_words():
    twenty_train = pickle.load(open("twenty_train.p", "rb"))
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(twenty_train.data)
    X_train_counts.shape
    count_vect.vocabulary_.get(u'algorithm')
    occurrences_to_frequencies(X_train_counts, twenty_train, count_vect)
Example No. 29
	def __init__(self, plot_vectorizer = 'count', tokenizer = None, lda = False, use_genre_vecs = False):
		t = None
		if tokenizer == 'named_entity':
			t = NETokenizer()
		elif tokenizer == 'lemma':
			t = LemmaTokenizer()
		self.use_genre_vecs = use_genre_vecs
		self.binary = plot_vectorizer == 'binary'
		if plot_vectorizer == 'tfidf':
			self.vectorizer = TfidfVectorizer(analyzer = "word",   \
				tokenizer = t,    \
				preprocessor = None, \
				stop_words = 'english')
		elif plot_vectorizer == 'binary':
			self.vectorizer = CountVectorizer(analyzer = "word",	\
				tokenizer = t,	\
				preprocessor = None, \
				stop_words = 'english', \
				binary = True)
		else:
			self.vectorizer = CountVectorizer(analyzer = "word",   \
				tokenizer = t,    \
				preprocessor = None, \
				stop_words = 'english')
		if lda:
			self.lda = LatentDirichletAllocation(n_topics=20, max_iter=2,	\
				learning_method='online', learning_offset=10.,	\
				random_state=0)
		else:
			self.lda = None
Example No. 30
def token_count_pac(pac_id,           \
                    limit       = 'ALL', \
                    ngram_range = (2,2), \
                    min_df      = 5):
    
    conn    = psql.connect("dbname='keyword-influence'")
    cursor  = conn.cursor()

    cursor.execute("SELECT id, speaking                       \
                    FROM words                                \
                    WHERE id IN (                             \
                        SELECT id                             \
                        FROM words                            \
                        WHERE bioguide_id IN(                 \
                            SELECT bioguide_id                \
                            FROM pac_contrib as pc            \
                            INNER JOIN congress as c          \
                            ON pc.fec_candidate_id = c.fec_id \
                            WHERE pac_id = '"+ pac_id +"'));")
    sql_result = cursor.fetchall()

    counter   = CountVectorizer(stop_words  = corpus.stopwords.words('english'), \
                                ngram_range = ngram_range,                       \
                                min_df      = min_df)
    chunks    = map(lambda x: x[1], sql_result)
    counts    = counter.fit_transform(chunks)
    vocab     = counter.get_feature_names()
    vocab     = dict(zip(range(len(vocab)),vocab))
    
    return [counts, vocab]
Example No. 31
train = pd.read_csv("../datasets/dataset_1/train.csv",
                    header='infer',
                    index_col=None)
x_train, x_test, y_train, y_test = train_test_split(train["SentimentText"],
                                                    train["Sentiment"],
                                                    random_state=1000,
                                                    test_size=0.3)

# nb_classes= np.max(y_train) + 1
# from keras.utils import np_utils
# Y_train = np_utils.to_categorical(y_train, nb_classes)
# Y_test = np_utils.to_categorical(y_test, nb_classes)

# >>> COUNT VECTORIZER >>>
count_vect = CountVectorizer()
X = count_vect.fit_transform(x_train)
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)


# >>> TUNING BATCH SIZE AND EPOCHS >>>
def create_model():
    # Build the model
    model = Sequential()
    input_dim = X.shape[1]
    model.add(Dense(512, input_dim=input_dim))
    model.add(Dense(num_classes))
def kfold_cross(data):
    kf = KFold(n_splits=30)
    num_test = 1

    #for avg
    accuracy_tot = 0
    precision_tot = 0
    recall_tot = 0
    f_p_tot = 0
    f_measure_tot = 0

    for train_index, test_index in kf.split(data):
        #create train and test list
        train_msg = list()
        train_classe = list()
        test_msg = list()
        test_classe = list()

        print("\nStarting test N°", str(num_test))
        for i in train_index:
            train_msg.append(data[i][0])
            train_classe.append(data[i][1])
        for j in test_index:
            test_msg.append(data[j][0])
            test_classe.append(data[j][1])

        #call count vectorizer that will create a matrix of token counts
        v = CountVectorizer()
        train_matrix = v.fit_transform(train_msg)

        #call multinomial naive bayes
        clf = MultinomialNB()
        clf.fit(train_matrix, train_classe)

        #create a matrix for the test list
        test_matrix = v.transform(test_msg)

        #predict with multinomial the test set
        predicted = clf.predict(test_matrix)

        #create a matrix of dim [2,2]
        conf_matrix = confusion_matrix(test_classe, predicted)
        tn, fp, fn, tp = conf_matrix.ravel()

        accuracy = (accuracy_score(test_classe, predicted)) * 100
        precision = precision_score(test_classe,
                                    predicted,
                                    pos_label='malware')
        recall = recall_score(test_classe, predicted, pos_label='malware')

        f_p_rate = fp / (fp + tn)
        f_measure = 2 * (precision * recall) / (precision + recall)

        accuracy_tot += accuracy
        precision_tot += precision
        recall_tot += recall
        f_p_tot += f_p_rate
        f_measure_tot += f_measure

        #print all the results
        print('Confusion matrix:')
        print(confusion_matrix(test_classe, predicted))
        print('Accuracy is: ' + str(accuracy)[:5] + '%')
        print('Precision: ' + str(precision)[:5])
        print('Recall: ' + str(recall)[:5])
        print('False-positive rating: ' + str(f_p_rate)[:5])
        print('F-measure: ' + str(f_measure)[:5])
        num_test += 1

    print('\nPerformance Evaluation, AVG Values:')
    print('Accuracy: ' + str(accuracy_tot / 30)[:5] + '%')
    print('Precision: ' + str(precision_tot / 30)[:5])
    print('Recall: ' + str(recall_tot / 30)[:5])
    print('False-positive rating: ' + str(f_p_tot / 30)[:5])
    print('F-measure: ' + str(f_measure_tot / 30)[:5])
email_data = email_data.loc[email_data.text != " ",:]
##There are no empty spaces

##Creating a matrix of token counts for the entire text document
def split_if_words(i):
    return [word for word in i.split(" ")]

predictors = email_data.iloc[:,1]
target = email_data.iloc[:,0]
#Splitting the data
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(predictors, target, test_size = 0.3, stratify = target)

##Convert email text into a word count matrix, i.e. bag of words
email_bow = CountVectorizer(analyzer = split_if_words).fit(email_data["text"])

##For all the emails doing the transformation

all_emails_matrix = email_bow.transform(email_data["text"])
all_emails_matrix.shape
#(5559, 6661)

##For training data
train_emails_matrix = email_bow.transform(x_train)
train_emails_matrix.shape
#(3891, 6661)

##For test data
test_emails_matrix = email_bow.transform(x_test)
test_emails_matrix.shape
Example No. 34
    all_the_text = all_the_text.decode('utf-8', 'ignore')

    essay_list = re.split('#@\d+', all_the_text)


    file_object = open('NGSL_lemmatized')
    try:
        all_the_text = file_object.read( )
    finally:
        file_object.close( )
    Dict = all_the_text.split()
    punct = [',','.','?','!',':','\'','"']
    #Dict.append(punct)

    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(Dict)

    charac_num = [] #0. number of characters
    tokens_num = [] #1. number of tokens
    difftokens_num = [] #2. number of different tokens
    diff_to_tokens = [] #3. 2/1
    sentence_num = [] #4.number of sentences
    aver_charac = [] #5. 0/1
    aver_tokens = [] #6. 1/5
    words_in_NGSL = [] #7. ratio of words in NGSL
    E1_percent = [] #8.ratio of E1 error
    bag_of_words = [] #9. bag_of_words

    feature_vector = []
Example No. 35
import jieba
import jieba.posseg as pseg
import os
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import json

if __name__ == "__main__":
    #     corpus=["我 來到 北京 清華大學", # segmented result of the first document, tokens separated by spaces
    #         "他 來到 了 網易 杭研 大廈", # segmented result of the second document
    #         "小明 碩士 畢業 於 中國 科學院", # segmented result of the third document
    #         "我 愛 北京 天安門"] # segmented result of the fourth document
    corpus = []
    with open("E:/AB104/Expedia/Hotels-City-Suites-Kaohsiung-Chenai_comments.json", "r") as a:
        Com_list = json.load(a)
        for i in Com_list:
            for j in i["comment_collection"]:
                corpus.append(j["comment"])

    vectorizer = CountVectorizer(ngram_range=(2,2))  # converts the corpus into a term-frequency matrix; a[i][j] is the count of term j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every term
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # the inner fit_transform builds the count matrix, the outer one computes tf-idf
    word = vectorizer.get_feature_names()  # all terms in the bag-of-words model
    weight = tfidf.toarray()  # extract the tf-idf matrix; weight[i][j] is the tf-idf weight of term j in document i
    for i in range(len(weight)):  # print the tf-idf weights per document: the outer loop walks the documents, the inner loop the terms
        print u"------- tf-idf term weights for document", i + 1, u"-------"
        for j in range(len(word)):
            print word[j], weight[i][j]
Example No. 36
# In[35]:


# using countvectorizer to convert a collection of text documents to a matrix of token counts


# In[36]:


from sklearn.feature_extraction.text import CountVectorizer


# In[37]:


bow_transformer=CountVectorizer(analyzer=text_process).fit(messages['message'])


# In[38]:


print(len(bow_transformer.vocabulary_))


# In[39]:


#example text message and get its bag-of-words counts as a vector 


# In[40]:
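The original notebook is truncated here; below is a minimal sketch of what the comment above describes, assuming the messages DataFrame and text_process function defined earlier in that notebook.

sample_message = messages['message'][3]            # hypothetical row index
bow_sample = bow_transformer.transform([sample_message])
print(bow_sample)        # sparse vector of token counts for this one message
print(bow_sample.shape)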
Example No. 37
data = pd.read_csv("train.csv")


def show_topic(model, feature_names, top):
    for index, distribution in enumerate(model.components_):
        sorted_word_indices = distribution.argsort()[::-1][:top]
        print(f"Topic {index}:")
        print(" ".join([feature_names[i] for i in sorted_word_indices]))


# CountVectorizer
tf_vectorizer = CountVectorizer(
    # set up your CountVectorizer
    tokenizer=lambda text: [
        token.text for token in nlp(text)
        if not token.is_stop and not token.is_punct
    ],
    max_df=.8,  # 0.8(float) * 25,000
    min_df=25,  # 25(int) out of 25,000
    max_features=1000,  #top 1000 tokens
)
tf = tf_vectorizer.fit_transform(data["review"])

lda = LatentDirichletAllocation(
    # set up your LatentDirichletAllocation
    n_components=20,  # number of topics; can be any other value
    learning_method="online",
    learning_offset=50,
    random_state=2020,
)
lda.fit(tf)
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

#Exploring the data
emails = fetch_20newsgroups(categories = ['rec.sport.baseball', 'rec.sport.hockey'])
print(emails.target_names)
print(emails.data[5])
print(emails.target[5]) #1
print(emails.target_names) #rec.sport.hockey

#Making the Training and Test Sets
train_emails = fetch_20newsgroups(categories = ['comp.sys.ibm.pc.hardware','rec.sport.hockey'], subset = 'train', shuffle = True, random_state = 108)

test_emails = fetch_20newsgroups(categories = ['comp.sys.ibm.pc.hardware','rec.sport.hockey'], subset = 'test', shuffle = True, random_state = 108)

#Counting words
counter = CountVectorizer()
counter.fit(test_emails.data + train_emails.data)
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)

#Making a Naive Bayes Classifier
classifier = MultinomialNB()
classifier.fit(train_counts, train_emails.target)
print(classifier.score(test_counts, test_emails.target)) #0.9974

#Test other datasets

Example No. 39
def linear_SVM(x_train,y_train,x_test,y_test):
	sgd = Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))])
	sgd.fit(x_train,y_train)
	y_pred = sgd.predict(x_test)
	print("Linear SVM: "+str(accuracy_score(y_pred,y_test)))
	print(classification_report(y_test, y_pred,target_names=flairs))
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 7)[0]
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")
    
train_clean_tweet=[]
for tweet in train['TweetText']:
    train_clean_tweet.append(tweet)
test_clean_tweet=[]
for tweet in test['TweetText']:
    test_clean_tweet.append(tweet)
        
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer(analyzer = "word")
train_features= v.fit_transform(train_clean_tweet)
test_features=v.transform(test_clean_tweet)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
Classifiers = [
    LogisticRegression(C=0.000000001,solver='liblinear',max_iter=200),
    KNeighborsClassifier(3),
Example No. 41
from flask import Flask, render_template, request
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import pickle
df = pd.read_csv('movie.csv')
similarity_matrix = pickle.load(open('similarity_matrix.pkl', 'rb'))
cv = CountVectorizer(stop_words='english')
app = Flask(__name__)


@app.route('/')
def hello():
    return render_template('index.html')


@app.route('/recommend', methods=['POST'])
def recommend():
    movie_name = request.form.get('movie_name')
    movie_list = recommender(movie_name)
    return render_template('index.html', movie_list=movie_list)


def recommender(movie_name):
    # find the index of this movie
    index_pos = df[df['title'] == movie_name].index[0]
    # calculate similarity
    recommended_movie_index = sorted(list(
        enumerate(similarity_matrix[index_pos])),
                                     reverse=True,
Example No. 42
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
import joblib

df = pd.read_csv(r'C:\Users\meysam-sadat\Downloads\Compressed\emails.csv')

my_vectorize = CountVectorizer()

x = my_vectorize.fit_transform(df['text'])
y = df.spam.values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
truncated = TruncatedSVD(n_components=1000)
x_train = truncated.fit_transform(x_train)
x_test = truncated.transform(x_test)

clf = MLPClassifier(hidden_layer_sizes=(200, 50))
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))

joblib.dump(clf, 'spam_pam_detector.joblib')
model = joblib.load('spam_pam_detector.joblib')
prediction = model.predict(x_test)
# ```
# 
# 
# Apply `CountVectorizer` to the `newsgroups` _data set_ and find the number of times the word _phone_ appears in the corpus. Answer as a single scalar.

# In[42]:


categories = ['sci.electronics', 'comp.graphics', 'rec.motorcycles']
newsgroup = fetch_20newsgroups(subset="train", categories=categories, shuffle=True, random_state=42)


# In[43]:


vectorizer = CountVectorizer()
X = vectorizer.fit_transform(newsgroup.data)


# In[44]:


df_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df_words['phone'].sum()


# In[45]:


def q6():
    df_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
Example No. 44
def naiveBayes(x_train,y_train,x_test,y_test):
	nb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
	nb.fit(x_train,y_train)
	y_pred = nb.predict(x_test)
	print("Naive Bayes: "+str(accuracy_score(y_pred,y_test)))
	print(classification_report(y_test, y_pred,target_names=flairs))
Example No. 45
		for j in range(col):
			if j < i:
				prod[index] = np.inner(X[:,i], X[:,j])
				index += 1
	return prod

# Define the original dimension
n = int()
m = int(5000)

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# categories = ['sci.med']

textdata = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

count_vect = CountVectorizer()
word_counts = count_vect.fit_transform(textdata.data)

tf_transformer = TfidfTransformer(use_idf=False).fit(word_counts)
# the results is a scipy.sparse.csr.csr_matrix
X = tf_transformer.transform(word_counts)
# Clip the dataset
X = X[:,:m]
# normalize and transpose the dataset
X = normalize(X, 'l2')
X = X.asfptype()
X = X.transpose()
print("The text data fequency matrix size is:", X.shape)
prod_init = get_inner_product(scipy.eye(m)*X)
[dummy, n] = X.shape
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

stop_words = ['a', 'of', 'for']

# ngram_range controls the n in n-gram; e.g. ngram_range=(1, 2) extracts unigram and bigram features
# max_features caps the vocabulary size; if set, terms are ranked by frequency and the list is truncated
count_vec = CountVectorizer(
    tokenizer=word_tokenize, stop_words=stop_words, max_features=100, ngram_range=(1, 3))
count_vec.fit(documents)
# or use fit_transform:
# sparse_matrix = count_vec.fit_transform(documents)

# the vocabulary learned by count_vec
count_vec.vocabulary_
print(len(count_vec.vocabulary_))
print(count_vec.vocabulary_['trees'])  # the id assigned to 'trees'

# sparse document-term matrix
sparse_matrix = count_vec.transform(documents)

# convert to a dense matrix
matrix = sparse_matrix.toarray()
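A brief follow-up to the comments above, showing that with ngram_range=(1, 3) the truncated vocabulary mixes single words with two- and three-word phrases (output is indicative only).

# Inspect a handful of learned features; multi-word entries come from the
# bigram and trigram settings, and max_features=100 caps the list.
print(sorted(count_vec.vocabulary_.keys())[:10])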
Example No. 47
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
corpus = []
for i in range(0, 1000):
  review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)




root = Tk()
root.title('Review Analyser')
root.geometry("500x500")
Example No. 48
    LogisticRegression(random_state=0,
                       solver="lbfgs",
                       multi_class="auto",
                       max_iter=5000,
                       n_jobs=-1),
    MultinomialNB(),
    DecisionTreeClassifier(random_state=0),
    SVC(kernel="rbf", gamma="scale"),
    KNeighborsClassifier(n_neighbors=3, n_jobs=-1),
    RandomForestClassifier(n_estimators=100, n_jobs=-1)
]

# Vectorizers
vectorizers = [
    CountVectorizer(input="content",
                    encoding="utf-8",
                    tokenizer=lambda x: x.split(",")),
    TfidfVectorizer(analyzer="word", token_pattern=r"([^,]+)"),
    CountVectorizer(input="content",
                    encoding="utf-8",
                    tokenizer=lambda x: x.split(","),
                    ngram_range=(3, 3)),
    TfidfVectorizer(analyzer="word",
                    token_pattern=r"([^,]+)",
                    ngram_range=(3, 3)),
    CountVectorizer(input="content",
                    encoding="utf-8",
                    tokenizer=lambda x: x.split(","),
                    ngram_range=(4, 4)),
    TfidfVectorizer(analyzer="word",
                    token_pattern=r"([^,]+)",
Example No. 49





    # ****** Create a bag of words from the training set
    #
    print "Creating the bag of words...\n"


    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)

    # fit_transform() does two things: first, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of
    # strings.
    train_data_features = vectorizer.fit_transform(d_train)

    # Numpy arrays are easy to work with, so convert the result to an
    # array
    train_data_features = train_data_features.toarray()

    # ******* Train a random forest using the bag of words
    #
	def bag_of_words(self):
		self.bag_of_words_matrix = CountVectorizer().fit_transform(self.sentences)
		if(debug):
			return self.bag_of_words_matrix
Example No. 51
# get the sample weights
trainFilePath = "bydata-train_03"
sample_weights_array = weights(trainFilePath)
print(sample_weights_array)

# load the data files
train = datasets.load_files("bydata-train_03")  # training files
test = datasets.load_files("bydata-test")  # test files

# print("train target:", train.target_names[:])  # target_names: the class names
# print("index:", train.target[:])  # target: indices into target_names

# Tokenize the documents to build a vocabulary; new documents are then encoded against it.
# The result is a vector whose length equals the vocabulary size, each entry being the count of that word in the document.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train.data)
# print ('X_train_counts:', X_train_counts.toarray())

## one step
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None).\
     fit(X_train_tfidf, train.target, sample_weight=np.array(sample_weights_array))  # classifier

X_new_counts = count_vect.transform(test.data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)  # indices of the predicted classes
print('predicted:', predicted)
Example No. 52
topic summary
"""

import pickle
import random

# for consistent testing
random.seed(1532525625823)

raw_data = pickle.load(open("pickles/list-of-reviews.p", "rb"))

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords

count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))
train_counts = count_vect.fit_transform(random.sample(raw_data, 30000))

raw_data = None
btr = pickle.load(open("pickles/dict-of-business-to-reviews.p", "rb"))

test_counts = count_vect.transform(btr["Appliance Service Center"])

tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)
test_tfidf = tfidf_transformer.transform(test_counts)

dtm = train_tfidf
dtm_test = test_tfidf

vocab = count_vect.get_feature_names()
Example No. 53
def get_vectorizer(conf):
    if conf["pond"] is None:
        V = CountVectorizer(ngram_range=conf["N"], analyzer=conf["Tok"])
    elif conf["pond"] == "tf-idf":
        V = TfidfVectorizer(ngram_range=conf["N"])
    return V
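A usage sketch showing the conf dictionary this helper expects; the key names come from the function above, the values here are illustrative.

conf = {"pond": None, "N": (1, 2), "Tok": "word"}  # raw counts, uni- and bi-grams
V = get_vectorizer(conf)
X = V.fit_transform(["first small document", "second small document here"])
print(X.shape)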
Example No. 54
from sklearn import svm
import numpy as np
from sklearn.model_selection import GridSearchCV

test_file = 'test_data.txt'

strategy_instance = helper.strategy()

class_0 = strategy_instance.class0
class_1 = strategy_instance.class1

class_all = class_0 + class_1

class_all = [' '.join(i) for i in class_all]

vectorizer = CountVectorizer(stop_words=None)
vectorizer.fit(class_all)

##print(vectorizer.vocabulary_)

tmp_dic = vectorizer.vocabulary_

word_list = sorted(tmp_dic, key=lambda x: tmp_dic[x])

train_data = vectorizer.transform(class_all).toarray()
train_label = np.array([0] * 360 + [1] * 180)

test = []
with open(test_file) as f:
    for line in f:
        test.append(line)
Example No. 55
def I_BOW(data_time=['2014-07-01', '2014-12-31'],
          vec_time=['2014-07-01', '2014-12-31'],
          max_features=100,
          fit=False):
    global weibo_train_data
    global weibo_predict_data
    global features_log
    print "loading data..."
    if data_time[0] > '2014-12-31':
        data = weibo_predict_data.copy()
        data['context_clean'] = pd.Series.from_csv(
            '../data/predict_context_clean.csv')
    else:
        data = weibo_train_data.copy()
        data['context_clean'] = pd.Series.from_csv(
            '../data/train_context_clean.csv')
    data.context_clean = data.context_clean.apply(lambda x: json.loads(x))
    data.context_clean = data.context_clean.apply(lambda x: ' '.join(x))
    if fit == True:
        print 'fitting and transforming...'
        data_time = vec_time
        vectorizer = CountVectorizer(analyzer = "word",   \
                                   tokenizer = None,    \
                                   preprocessor = None, \
                                   stop_words = None,   \
                                   max_features=max_features)
        features = vectorizer.fit_transform(                  \
             data[(data['time']<=data_time[1]) 		\
             & (data['time']>=data_time[0])].context_clean)
        print 'saving models...'
        joblib.dump(
            vectorizer, '../others/' + '_'.join(vec_time) + '_' +
            str(max_features) + '.vectorizer')
    else:
        print 'transforming...'
        vectorizer = joblib.load('../others/' + '_'.join(vec_time) + '_' +
                                 str(max_features) + '.vectorizer')
        features = vectorizer.transform(                  \
           data[(data['time']<=data_time[1]) 		\
           & (data['time']>=data_time[0])].context_clean)
    columns = ['I_BOW_' + str(i + 1) for i in range(max_features)]
    features = pd.DataFrame(features.toarray(), columns=columns)
    # write log
    print 'saving features...'
    feature_name = 'I_BOW_' + '_'.join(data_time) + '_' + '_'.join(
        vec_time) + '_' + str(max_features)
    feature_address = '../features/' + feature_name + '.feature'
    features.to_csv(feature_address)
    usage = "train" if fit == True else "test"
    description = "Bag of Words in word count from "+str(data_time[0])+" to "+ \
    data_time[1]+" using top "+str(max_features)+" words"

    print "writing logs..."
    log = [
        feature_name, 'I_BOW', data_time, {
            'max_features': max_features,
            'vec_time': vec_time
        }, 'I', feature_address, usage, description,
        list(features.values.shape)
    ]
    writeLog(log, "features_log")

    return features
Example No. 56
list1 = []
for i in range(0, 5536):
    mail = df.Message[i]
    #print(mail)
    mail = re.sub('[^a-zA-Z]', ' ', mail)
    mail = mail.lower()
    mailwords = mail.split()
    mailwords = [
        ps.stem(word) for word in mailwords
        if word not in stopwords.words('english')
    ]
    mail = ' '.join(mailwords)
    list1.append(mail)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(list1).toarray()
y = df.Status.values

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
def classify():
    #    inp1 = raw_input("Enter No. of Positive Examples (Total Positive Sample is 69): ")
    #    inp2 = raw_input("Enter No. of Negative Examples (Total Positive Sample is 69): ")
    inp = raw_input(
        "Enter 'a' for inclusion statements or 'b' for interventions or 'c' for exclusion statements: "
    )

    inp1 = 67
    inp2 = 67
    trn_data = []
    trn_cat = []
    p1 = 0
    p2 = 0
    p3 = 0

    if inp == 'a':
        stng = 'training_corpora/inclusion_statements/jaccard/'
    elif inp == 'b':
        stng = 'training_corpora/interventions/jaccard/'
    elif inp == 'c':
        stng = 'training_corpora/exclusion_statements/jaccard/'
    else:
        print("Wrong input")
        exit()

# Preparing Positive Training Samples
    for i in range(1, int(inp1) + 1):
        trn_pos = stng + 'positive/pv' + str(
            i) + '.txt'  # File location (training using Jaccard)
        text = codecs.open(trn_pos, encoding='utf-8', mode='r').readlines()
        text = ''.join(text)
        sentences = tokenize.sent_tokenize(text)
        for s in sentences:
            s = re.sub('[^a-zA-Z0-9.?:!$\n]', ' ',
                       s)  # Remove special characters
            trn_data.append(s)
            trn_cat.append(0)
            p1 = p1 + 1

# Preparing Negative Training Samples
    for i in range(1, int(inp2) + 1):
        trn_neg = stng + 'negative/ng' + str(
            i) + '.txt'  # File location (training using Jaccard)
        text = codecs.open(trn_neg, encoding='utf-8', mode='r').readlines()
        text = ''.join(text)
        sentences = tokenize.sent_tokenize(text)
        for s in sentences:
            s = re.sub('[^a-zA-Z0-9.?:!$\n]', ' ',
                       s)  # Remove special characters
            trn_data.append(s)
            trn_cat.append(1)
            p2 = p2 + 1

# A pipeline of different parameters of the classifier
    pipeline = Pipeline([
        ('vect', CountVectorizer(token_pattern=r'\b\w+\b')),
        ('tfidf', TfidfTransformer()),
        ('svr', svm.SVC(kernel='linear', class_weight='balanced')),
    ])

    # Fix the values of the parameters using Grid Search and cross validation on the training samples
    parameters = {
        'vect__min_df': (2, 3),
        'vect__ngram_range': ((1, 2), (1, 3)),  # unigrams or bigrams
        'tfidf__use_idf': (True, False),
        'svr__C': (0.1, 10, 100, 1000),
    }
    grid = grid_search.GridSearchCV(pipeline, parameters, cv=10)
    grid.fit(trn_data, trn_cat)
    #    print("The best classifier is: ", grid.best_estimator_)
    clf = grid.best_estimator_

    # Classification of the test samples using the fixed pipeline
    tst_map = codecs.open('test_samples/test_file_map.txt',
                          encoding='utf-8',
                          mode='r').readlines()
    for b in range(0, len(tst_map), 2):
        # File name of test file to check
        fl = tst_map[b].strip('\n\r')
        print fl
        data = []
        tst_data = []
        p3 = 0
        # Preparing Test Samples
        text = codecs.open('test_samples/' + fl + '.txt',
                           encoding='utf-8',
                           mode='r').readlines()
        text = ''.join(text)
        sentences = tokenize.sent_tokenize(text)
        for s in sentences:  # Extracting sentences
            s = re.sub('[^a-zA-Z0-9.?:!$\n]', ' ',
                       s)  # Remove special characters
            tst_data.append(s)
            p3 = p3 + 1
        data.extend(trn_data)
        data.extend(tst_data)
        if inp == 'a':
            out = codecs.open('output/inclusion_statements/svm/' + fl +
                              '_svm.txt',
                              encoding='utf-8',
                              mode='w')  # Output file
            out.write('\n Using SVM Classifier: \n\n')
            out.write('Total No. of Sentences in the Reference: ' + str(p3) +
                      '\n\n')
            out.write('The Inclusion Statements are as Follow: \n\n')
        elif inp == 'b':
            out = codecs.open('output/interventions/svm/' + fl + '_svm.txt',
                              encoding='utf-8',
                              mode='w')  # Output file
            out.write('\n Using SVM Classifier: \n\n')
            out.write('Total No. of Sentences in the Reference: ' + str(p3) +
                      '\n\n')
            out.write('The Interventions are as Follow: \n\n')
        elif inp == 'c':
            out = codecs.open('output/exclusion_statements/svm/' + fl +
                              '_svm.txt',
                              encoding='utf-8',
                              mode='w')  # Output file
            out.write('\n Using SVM Classifier: \n\n')
            out.write('Total No. of Sentences in the Reference: ' + str(p3) +
                      '\n\n')
            out.write('The Exclusion Statements are as Follow: \n\n')


#   Results
        nps = 0
        clf.fit(trn_data, trn_cat)
        predicted = clf.predict(tst_data)
        for i in range(0, len(predicted)):
            if predicted[i] == 0:
                nps = nps + 1
                #            print 'Relevant Sentence '+str(nps)
                #            print '\n'+data[p1+p2+i]+'\n'
                out.write('\n' + str(nps) + ")  " + tst_data[i] + '\n')
        print("Total No. of Positive Sentences: %d" % nps)
Example No. 58
df.drop(df.index[630:680], inplace=True)
df.drop(df.index[680:750], inplace=True)
df.drop(df.index[750:800], inplace=True)
df.drop(df.index[800:850], inplace=True)
df.drop(df.index[850:880], inplace=True)
df.drop(df.index[880:900], inplace=True)
df.drop(df.index[900:911], inplace=True)

X_train = df['URL']
y_train = df['Category']

X_test = dt['URL']
y_test = dt['Category']

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf = text_clf.fit(X_train, y_train)

from sklearn.model_selection import RandomizedSearchCV
n_iter_search = 5
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3)
}
gs_clf = RandomizedSearchCV(text_clf, parameters, n_iter=n_iter_search)
gs_clf = gs_clf.fit(X_train, y_train)

#read url file
cf = pd.read_csv('./urls.csv')
Example No. 59
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
corpus = []

with open("sirgawain.txt") as f:
    for line in f:
        corpus.append(line.rstrip())

X = vectorizer.fit_transform(corpus)

print X.toarray()

print vectorizer.get_feature_names()
tweets = dataset['Text']
classes = dataset['Sentimento']

# Building the DataFrame
tweets_Dataframe = pd.DataFrame({'Text': tweets, 'Classificacao': classes})
# Check how many tweets the dataset contains
print(len(tweets_Dataframe))

# Check how many samples of each class exist
print(tweets_Dataframe.Classificacao.value_counts())
# Display as a bar chart
fig = plt.figure(figsize=(8, 6))
tweets_Dataframe.groupby('Classificacao').count().plot.bar(ylim=0)
plt.show()
# normalisation techniques: oversampling and undersampling
vectorizer = CountVectorizer(ngram_range=(1, 2))
bow = vectorizer.fit_transform(tweets)

# Displaying the bag of words. Remember to use the smaller slice [:50]
#bow_data_frame = pd.DataFrame(bow.A,columns = vectorizer.get_feature_names())

# Normalising occurrences: word frequencies
tfidf_transformer = TfidfTransformer()
bow = tfidf_transformer.fit_transform(bow)

# Displaying the bag of words after TF-IDF. Remember to use the smaller slice [:50]
#bow_data_frame = pd.DataFrame(bow.A,columns = vectorizer.get_feature_names())

modelo = MultinomialNB()
modelo.fit(bow, classes)