def cal_product_title_tfidf():

    #PART I compute the tf-idf for product title
    print "\nBegins,compute the tf-idf for product title ..."


    print "\nStemming product_title..."
    AllSet['product_title'] = AllSet['product_title'].map(lambda x : stem_process(x))
    product_title = AllSet['product_title']

    print "\nGet the (product title vocabulary)-(search term) frequency matrix..."
    search_vect_tittle = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency
    search_vect_tittle.fit(product_title)#learn the vocabulary
    search_tittle_fq_matrix = search_vect_tittle.transform(search_term) #get the (product title vocabulary)-(search term) frequency matrix; search_term is assumed to be defined at module level

    print "\nGet the (product title vocabulary)-(product_title) frequency matrix"
    title_vect = CountVectorizer(stop_words='english')
    title_vect.fit_transform(product_title)#learn the vocabulary
    title_fq_matrix = title_vect.transform(product_title) #get the (product title vocabulary)-(product_title) frequency matrix

    print "\nGet the idf matrix"
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(title_fq_matrix) # get idf for each vocabulary
    tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix) #get the idf matrix

    print "\nCompute the result of tf-idf for product title ..."
    tf_idf_title_result = [] #compute the result of tf-idf for product title
    for index in range(tf_idf_title_matrix.shape[0]):
        tf_idf_title_result.append((np.multiply(tf_idf_title_matrix[index], search_tittle_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'],"product_title_tfidf": tf_idf_title_result}).to_csv('product_title_tfidf.csv', index=False)

    return 0
def vectorize_data(quote_list, vectorizer = None, Tfidf = True, min_df = 1, 
                   ngram_range = (1,2), token_pattern = r'\b\w\w+\b'):
    '''
    Vectorizes the given data using the desired vectorizer object.

    Input:
        quote_list: list of text documents to vectorize
        vectorizer: CountVectorizer/TfidfVectorizer object (optional)
            A vectorizer object to use. If None, create and fit a new
            CountVectorizer (or TfidfVectorizer when Tfidf is True).
            Otherwise, re-fit the provided vectorizer on the given data.

    Output:
        scipy sparse CSC matrix (dims: n_quotes x n_features)
            Bag-of-words representation for each quote.
    '''
    
    # if no vectorizer was passed, declare a vectorizer object
    if(vectorizer is None): 
        if(Tfidf == False):
            vectorizer = CountVectorizer(min_df = min_df, ngram_range = ngram_range, token_pattern = token_pattern)
        else:
            vectorizer = TfidfVectorizer(min_df = min_df, ngram_range = ngram_range, token_pattern = token_pattern)
    
    # build the vectorizer vocabulary
    vectorizer.fit(quote_list)

    # transform into bag of words
    X = vectorizer.transform(quote_list)
    
    return X.tocsc()
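# A minimal usage sketch (sample_quotes is a made-up toy corpus; assumes the CountVectorizer /
# TfidfVectorizer imports that vectorize_data relies on):
sample_quotes = ["the quick brown fox", "jumps over the lazy dog", "the lazy dog sleeps"]
X_counts = vectorize_data(sample_quotes, Tfidf=False)  # bag-of-words counts
X_tfidf = vectorize_data(sample_quotes)                # tf-idf weights (the default)
print(X_counts.shape, X_tfidf.shape)                   # both are (n_quotes, n_vocabulary) sparse CSC matrices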
class ACMClassificator(BaseACMClassificator):
    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(ExtraTreeClassifier(criterion="gini",
                                                                     max_depth=None,
                                                                     min_samples_split=2,
                                                                     min_samples_leaf=1,
                                                                     min_weight_fraction_leaf=0.,
                                                                     max_features="auto",
                                                                     max_leaf_nodes=None,
                                                                     class_weight=None),
                                                 n_jobs=-1
                                                 )

    def _prepare_problems(self, problems):
        return self.vectorizer.transform([p.statement for p in problems])

    def fit(self, problems, tags):
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        mat = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        self.classificator.fit(mat.toarray(), self.mlb.transform(tags))

    def predict(self, problems):
        mat = self._prepare_problems(problems)
        predicted = self.classificator.predict(mat.toarray())
        return self.mlb.inverse_transform(predicted)
def get_vectorizer(article_texts, max_features=50000):
    vectorizer = CountVectorizer(ngram_range=(1,2), stop_words="english",
                                    min_df=2,
                                    token_pattern=r"(?u)95% confidence interval|95% CI|95% ci|[a-zA-Z0-9_*\-][a-zA-Z0-9_/*\-]+",
                                    binary=False, max_features=max_features)
    vectorizer.fit(article_texts)
    return vectorizer
def trainModel(test_data):
    predictions = dict()
    outcome_list=('DE', 'LT', 'HO', 'DS', 'CA', 'RI', 'OT')
    for o in outcome_list:
        info,outcome=loadData('Outcomes' + '/' + o +'.txt')
        #split data into training dataset      
        train, test, labels_train, labels_test = train_test_split(info, outcome, test_size=0.33)
        counter = CountVectorizer()
        counter.fit(train)
        
        #count the number of times each term appears in a document and transform each doc into a count vector
        counts_train = counter.transform(train)#transform the training data
        counts_test = counter.transform(test_data)#transform the new data

        #build a classifier on the training data        
        LR = LogisticRegression()     
        LR.fit(counts_train,labels_train)        
        #use the classifier to predict on new data
        predicted=LR.predict(counts_test)
        
        #determine prediction results
        if 1 in predicted:
            flag = 'yes'
        else:
            flag = 'no'
        predictions[o] = flag #store result of each outcome
    return predictions
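# Hypothetical usage sketch: score a list of new (unlabelled) documents against every outcome model.
new_docs = ["patient was discharged home in stable condition"]  # illustrative text only
outcome_flags = trainModel(new_docs)  # dict mapping each outcome code ('DE', 'LT', ...) to 'yes'/'no'
print(outcome_flags)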
Example #6
def main():
    global tweetdata
    for d in tweetdata.find({}, {'_id': 1, 'id': 1, 'text': 1}):
        res = mecab_analysis(unicodedata.normalize('NFKC', d['text']))
        for k in res.keys():
            if k == '形容詞':
                adjective_list = []
                for w in res[k]:
                    adjective_list.append(w)
                    freq[w] += 1
                tweetdata.update({'_id': d['_id']}, {'$push': {'adjective': {'$each': adjective_list}}})
            elif k == '動詞':
                verb_list = []
                for w in res[k]:
                    verb_list.append(w)
                    freq[w] += 1
                tweetdata.update({'_id': d['_id']}, {'$push': {'verb': {'$each': verb_list}}})
            elif k == '名詞':
                noun_list = []
                for w in res[k]:
                    noun_list.append(w)
                    freq[w] += 1
                tweetdata.update({'_id': d['_id']}, {'$push': {'noun': {'$each': noun_list}}})
        tweetdata.update({'_id': d['_id']}, {'$set': {'mecabed': True}})
    ret_all = get_mecabed_strings()
    tw_list_all = ret_all['tweet_list']
    c_vec = CountVectorizer(stop_words=[u"寿司"])
    c_vec.fit(tw_list_all)
    c_terms = c_vec.get_feature_names()
    transformed = c_vec.transform(tw_list_all)
    arg_ind = np.argsort(transformed.toarray())[0][:-50:-1]
    top_terms = [(k, freq[k]) for k in sorted(freq, key=freq.get, reverse=True)[0:100]]  # a list (not a generator), so it can be written out and then printed
    write_to_csv(top_terms)
    for k, v in top_terms:
        print(k + '\t\t\t' + str(v))
Example #7
def naive_bayes(x_value, y_value):
    X = x_value
    y = y_value

    #train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)

    vect = CountVectorizer()
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)

    X_test_dtm = vect.transform(X_test)

    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    
    print 'Accuracy: '
    print metrics.accuracy_score(y_test, y_pred_class)
    
    print 'Null Accuracy: '
    print y_test.value_counts().head(1) / len(y_test)
    
    print 'Confusion Matrix: '
    print metrics.confusion_matrix(y_test, y_pred_class)
def train_vectorizer(corpus, max_features=10000):
    """ Train the vectorizer """
    print "training the vectorizer..."
    vectorizer = CountVectorizer(decode_error='ignore', max_features=max_features)
    vectorizer.fit(corpus)
    print "ok"
    return vectorizer
Example #9
def vectorize_in_test(col_name):
    v = CountVectorizer(tokenizer=my_tokenizer, stop_words=None, strip_accents="unicode")
    vv = CountVectorizer(tokenizer=my_tokenizer, stop_words=None, strip_accents="unicode")
    v.fit(train_data[col_name])
    vv.fit(test_data[col_name])
    stop = [w for w in v.vocabulary_.keys() if w not in vv.vocabulary_.keys()]
    return stop
def fit(x, y, estimator, dataframe, params):
    vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True)
    vectorizer.fit(dataframe[x].values)
    fresh_estimator = clone(estimator)
    x_np, y_np, feature_names, selector = \
    select_features(
        df = dataframe,
        vectorizer=vectorizer,
        feature_col=x,
        label_col=y,
        select_method=None,
        continuous_col=None
    )
    estimator = RandomizedSearchCV(estimator, params, n_iter=60, cv=3, n_jobs=3, refit=True)
    estimator.fit(x_np, y_np)
    best_params = estimator.best_params_

    if method not in ['lr', 'svm']:  # NOTE: 'method' is assumed to be defined at module level
        print("Calibrating...")
        estimator = CalibratedClassifierCV(fresh_estimator.set_params(**best_params), 'isotonic', 3)
        estimator.fit(x_np, y_np)

    from sklearn.base import _pprint
    _pprint(estimator.get_params(deep=True), offset=2)
    return estimator, selector, vectorizer
def find_common_words(all_words, num_most_frequent_words):
    vectorizer = CountVectorizer(
        stop_words=None, # 'english',
        max_features=num_most_frequent_words,
        binary=True)
    vectorizer.fit(all_words)
    return (vectorizer.vocabulary_, vectorizer.get_feature_names())
def main():
    print 'Opening ZIP file'
    zin = zipfile.ZipFile(config.html_cleaned_zip, 'r')

    filenames = zin.namelist()
    filenames = filenames[0:10]
    filenames.sort()

    print 'Reading ZIP file'
    ordering = {n:i for i, n in enumerate(filenames)}
    #contents = [zin.open(n, 'r') for n in filenames]

    cv = CountVectorizer(stop_words=config.common_words,
                         input='file',
                         dtype=np.float32)
    print 'Learning vocabulary'
    cv.fit(zin.open(n) for n in filenames)  # input='file' expects file-like objects, so open each archive member
    vocabulary = cv.vocabulary_
    
    print 'Generating word vectors'
    docmat1 = cv.transform(zin.open(n) for n in filenames)  # re-open the members for the transform pass

    print 'Generating TF-IDF word vectors'
    docmat2 = TfidfTransformer().fit_transform(docmat1)

    print 'Writing output'
    with open(config.html_config, 'w') as pf:
        pickle.dump((filenames, ordering, vocabulary), pf, pickle.HIGHEST_PROTOCOL)

    np.savez(config.doc_mat, plain=docmat1, tfidf=docmat2)
class punctuation_ngrams_fe(feature_extractor):
    def __init__(self, config_file):
        super(punctuation_ngrams_fe, self).__init__(config_file)
        self.token_pattern = u'[,;\.:?!¿¡]+'
        self.ngram_x = 2
        self.ngram_y = 2

    def train(self, authors):
        documents = [self.db.get_author(a)["corpus"] for a in authors]
        documents = utils.flatten(documents)
        self.ngram_vectorizer = \
            CountVectorizer(ngram_range=(self.ngram_x, self.ngram_y),\
                                         token_pattern=self.token_pattern,\
                                         analyzer='word')
        self.ngram_vectorizer.fit(documents)
        # use only normalized term frequencies
        self.transformer = TfidfTransformer(use_idf=False)

    def compute_features(self, author):
        freq = self.ngram_vectorizer.transform(author["corpus"])
        freq = freq.toarray().astype(int)
        # normalized ngram frequencies
        norm_freq = self.transformer.fit_transform(freq).toarray()
        # average normalized frequencies among all author documents
        norm_freq = np.divide(np.sum(norm_freq, axis=0),
                                 len(norm_freq))

        ngrams = self.ngram_vectorizer.get_feature_names()
        for id_ngram, (ngram, value) in enumerate(zip(ngrams, norm_freq)):
            author = self.db.set_feature(author,
                                         "Ngram::punct::" + ngram,
                                         value)

        return author
Example #14
def featTransform(sents_train, sents_test):
    cv = CountVectorizer()
    cv.fit(sents_train)
    print(cv.get_params())
    features_train = cv.transform(sents_train)
    features_test = cv.transform(sents_test)
    return features_train, features_test, cv
Example #15
def company_search(company):
    CONSUMER_KEY = 'fH4YFq25oK61JwakuaJ5g'
    CONSUMER_SECRET = 'S8v2bm0y8jPy3oIsJl8QdZtx6BnDtbkiN2ANK65ZLM'
    OAUTH_TOKEN = '21964998-aeEYdcIHsmaKMrjBM4wqMqpFLlJ8Npy002DepKYsa'
    OAUTH_TOKEN_SECRET = 'fZa21ALNIBiWetskCIuaywLro05EwgG2VjgaczpbRawjB'
    
    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)
    
    twitter_api = twitter.Twitter(auth=auth)
    search_results = twitter_api.search.tweets(q=company,count=10000)
    date_status = [(datetime.datetime.strptime(re.sub('\+0000 ','',status['created_at']),'%a %b %d %H:%M:%S %Y').date(),status['text']) for status in search_results['statuses']]
    date_string_dict = {}
    for date,text in date_status:
        if date in date_string_dict:
            date_string_dict[date] = date_string_dict[date]+text
        else:
            date_string_dict[date]=text
    vectorizer = CountVectorizer(min_df=0)
    vectorizer.fit(date_string_dict.values())
    bag_matrix = vectorizer.transform(date_string_dict.values())
    bag_matrix=sparse.csc_matrix(bag_matrix)
    #type(bag_matrix)
    #bag_matrix.toarray()
    return date_string_dict,bag_matrix
def build_classifier(df_curated, df_all):
    vec = CountVectorizer(tokenizer=pre_process)
    vec.fit(df_all.tweet)
    bagofwords = vec.transform(df_curated.tweet)
    bagofwords = bagofwords.toarray()
    clf = MultinomialNB().fit(bagofwords, df_curated['class'])
    return vec, clf
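# Hypothetical usage sketch: classify unseen tweets with the fitted vectorizer/classifier pair
# (df_curated and df_all are assumed to be DataFrames with 'tweet' and 'class' columns, as above).
vec, clf = build_classifier(df_curated, df_all)
new_tweets = ["some unseen tweet text"]        # illustrative only
print(clf.predict(vec.transform(new_tweets)))  # predicted 'class' labels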
def prep_train_evaluate(docs_train, docs_test, labs_train, labs_test, **kwargs):
  '''func to prep text, extract features, train model, predict, evaluate'''

  # instantiate vectorizer + classifier 
  vectorizer = CountVectorizer(token_pattern=r'\b[a-zA-Z0-9_<>]{1,}\b', 
                               **kwargs)
  classifier = LogisticRegression(solver='liblinear')

  # construct feature matrices for train and test sets 
  vectorizer.fit(docs_train)
  X_train = vectorizer.transform(docs_train)
  X_test = vectorizer.transform(docs_test)

  # fit/train classifier using train features and labels 
  classifier.fit(X_train, labs_train)

  # generate test set model predictions from test matrix 
  preds_test = classifier.predict(X_test)

  # measure performance using simple accuracy (proportion correct) 
  accuracy = accuracy_score(labs_test, preds_test)

  # print lil message showing param settings + performance 
  print(f'  >> test set accuracy: {accuracy:.3f}\n({kwargs})\n')

  # return classifier, vectorizer, predictions, and score for inspection 
  return {'clf': classifier, 'vect': vectorizer, 
          'preds': preds_test, 'acc': accuracy}
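# Usage sketch (docs_train/docs_test and labs_train/labs_test are assumed to come from an earlier
# train/test split; any extra kwargs are forwarded to CountVectorizer above):
out_unigram = prep_train_evaluate(docs_train, docs_test, labs_train, labs_test, ngram_range=(1, 1))
out_bigram = prep_train_evaluate(docs_train, docs_test, labs_train, labs_test, ngram_range=(1, 2), min_df=2)
best = max([out_unigram, out_bigram], key=lambda run: run['acc'])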
Example #18
def test_countvectorizer_custom_vocabulary():
    what_we_like = ["pizza", "beer"]
    vect = CountVectorizer(vocabulary=what_we_like)
    vect.fit(JUNK_FOOD_DOCS)
    assert_equal(set(vect.vocabulary_), set(what_we_like))
    X = vect.transform(JUNK_FOOD_DOCS)
    assert_equal(X.shape[1], len(what_we_like))
def preprocesar(labeled, unlabeled, dims, stop_words=None):
    """preprocesar."""

    instances = []
    labels = []
    for v_l in labeled.values():
        instances += v_l['X']
        labels += v_l['y']

    if unlabeled is not None:
        for v_ul in unlabeled.values():
            instances += v_ul['X']

    x_cv = CountVectorizer(max_features=dims, ngram_range=(1, 2), binary=True, stop_words=stop_words)
    x_cv.fit(instances)

    y_cv = CountVectorizer()
    y_cv.fit(labels)

    print "\nEtiquetas:"

    for etiqueta, valor in y_cv.vocabulary_.items():
        print "\tEtiqueta: %s - Valor: %d" % (etiqueta, valor)
    print ""

    for d_l in labeled:
        labeled[d_l]['X'] = x_cv.transform(labeled[d_l]['X'])
        labeled[d_l]['y'] = y_cv.transform(labeled[d_l]['y'])

    if unlabeled is not None:
        for d_ul in unlabeled:
            unlabeled[d_ul]['X'] = x_cv.transform(unlabeled[d_ul]['X'])

    return labeled, unlabeled
Example #20
 def fit(self, X, y, min_df=0.005,max_df=0.8, *args, **kwargs):
     # Train the model using the training sets
     vect =  CountVectorizer(min_df=self.min_df, max_df=self.max_df, max_features=4500, ngram_range=(2,2))
     vect.fit(X)
     self.bivect  = CountVectorizer(ngram_range=(2,2), vocabulary=vect.vocabulary_)
     super(TlinReg, self).fit(vect.transform(X), y, *args, **kwargs)
     return self
def createSparsMatrix(featureDict, tupledTweets, flag):

    # print features
    tples = tupledTweets

    m = len(tples)
    tweets = []

    yValues = np.empty((m,))
    for i, line in enumerate(tples):
        yValues[i,] = int(line[0] == "true")
        tweets.append(line[1])

    vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 3), max_features=10000)
    if flag == 1:
        features = read_words(featureDict)
        vectorizer.fit(features)
    else:
        vectorizer.fit(featureDict)

    # print vectorizer.get_feature_names()
    xValues = vectorizer.transform(tweets)
    # print vectorizer.vocabulary_.get('high')
    # print xValues.toarray()
    return xValues, yValues, vectorizer
class ACMClassificator(BaseACMClassificator):
    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.classificator = RandomForestClassifier(n_estimators=256,
                                                    criterion="gini",
                                                    max_depth=None,
                                                    min_samples_split=2,
                                                    min_samples_leaf=1,
                                                    min_weight_fraction_leaf=0.,
                                                    max_features="auto",
                                                    max_leaf_nodes=None,
                                                    bootstrap=True,
                                                    oob_score=False,
                                                    n_jobs=-1,
                                                    class_weight=None)

    def _prepare_problems(self, problems):
        return self.vectorizer.transform([p.statement for p in problems])

    def fit(self, problems, tags):
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        mat = self._prepare_problems(problems)
        self.classificator.fit(mat.toarray(), tags)

    def predict(self, problems):
        mat = self._prepare_problems(problems)
        return self.classificator.predict(mat.toarray())
class lang_detector():

    def __init__(self,classifier=MultinomialNB()):
        self.classifier = classifier
        self.vectorizer = CountVectorizer(ngram_range=(1,2),max_features=1000,preprocessor=self._remove_noise)

    # need to remove #hastage, @mention and links
    
    def _remove_noise(self, document):
        noise_pattern = re.compile("|".join(["http\S+","\@\w+","\#\w+"]))
        clean_text = re.sub(noise_pattern,"",document)
        return clean_text

    def features(self,X):
        return self.vectorizer.transform(X)
    
    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self,x):
        return self.classifier.predict(self.features([x]))

    def score(self,X,y):
        return self.classifier.score(self.features(X),y)
def main(train_file, test_file):
  #print "loading data.."
  csv.field_size_limit(1310720)
  trainreader = csv.reader (open( '/home/kiran/kdd/train.csv' ))  # note: the train_file argument is ignored; the path is hard-coded
  projectid, traindata_old = zip (*trainreader)  

  testreader = csv.reader (open ('/home/kiran/kdd/test.csv'))
  projectid, testdata_old = zip (*testreader)


  # remove stopwords
  traindata = []
  testdata = []
  for observation in traindata_old:
      traindata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False))
  for observation in testdata_old:
      testdata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False))

  tfv = CountVectorizer (binary=1,ngram_range=(1, 1))
  X_all = traindata + testdata
  lentrain = len(traindata)
  tfv.fit(X_all)
  X_all = tfv.transform(X_all)
  X = X_all[:lentrain]
  X_test = X_all[lentrain:]
  scipy.io.mmwrite ('x_train_bin_1gram.mtx', X, field = 'real')
  scipy.io.mmwrite ('x_test_bin_1gram.mtx', X_test, field = 'real')
  myCols = tfv.get_feature_names ()
  myCols = DataFrame (myCols)
  myCols.to_csv ('bin_1gram.csv', index=False)
Example #25
 def fit(self, X, y, min_df=0.005,max_df=0.8, *args, **kwargs):
     # Train the model using the training sets
     vect =  CountVectorizer(stop_words='english', min_df=self.min_df, max_df=self.max_df, max_features=4500, ngram_range=(2,2))
     vect.fit([e['text'] for e in X])
     self.vocabulary_ = vect.vocabulary_
     super(TlinReg, self).fit(vect.transform(e['text'] for e in X), y, *args, **kwargs)
     return self
def cal_product_description_tfidf():
    #PART II compute the tf-idf for product description
    print "\nBegins,compute the tf-idf for product description ..."
    product_description_data = pd.read_csv('product_descriptions.csv')

    print "\nMerge the product description into database..."
    global AllSet  # AllSet (and search_term) are built at module level in PART I
    AllSet = pd.merge(AllSet, product_description_data, how='left', on='product_uid')

    print "\nStemming the product description ..."
    AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x))
    product_description=AllSet['product_description']

    print "\nGet the (product description vocabulary)-(search term) frequency matrix..."
    search_vect_descrip = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency
    search_vect_descrip.fit(product_description)#learn the vocabulary
    search_descrip_fq_matrix = search_vect_descrip.transform(search_term) #get the (product description vocabulary)-(search term) frequency matrix

    print "\nGet the (product description vocabulary)-(product_description) frequency matrix..."
    description_vect = CountVectorizer(stop_words ='english')
    description_vect.fit_transform(product_description)#learn the vocabulary
    description_fq_matrix = description_vect.transform(product_description) #get the (product description vocabulary)-(product_description) frequency matrix

    print "\nGet the idf matrix..."
    tfidf_transformer = TfidfTransformer(norm="l2",smooth_idf=True)
    tfidf_transformer.fit(description_fq_matrix) # get idf for each vocabulary
    tf_idf_descrip_matrix  = tfidf_transformer.transform(description_fq_matrix) #get the idf matrix


    print "\nCompute the result of tf-idf for product description ..."
    tf_idf_descrip_result = []  # compute the result of tf-idf for product description
    for index in range(tf_idf_descrip_matrix.shape[0]):
        tf_idf_descrip_result.append((np.multiply(tf_idf_descrip_matrix[index], search_descrip_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id":AllSet['id'],"product_description_tfidf": tf_idf_descrip_result}).to_csv('product_description_tfidf.csv', index=False)
class Featurizer(object):
    def __init__(self):
        self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
        self.bow_vectorizer = None
        self.bow_analyzer = None

    def bag_of_words(self, body):
        return self.bow_vectorizer.transform([body]).toarray()

    def text_features(self, comment):
        num_chars = len(comment.get("body"))
        num_links = count_links(comment.get("body"))

        simple_tokens = comment.get("body").split(' ')
        num_words = 0
        avg_word_length = 0
        for token in simple_tokens:
            num_words += 1
            avg_word_length += len(token)
        avg_word_length = float(avg_word_length) / float(num_words)

        sentiment = self.sentiment_analyzer.analyze(
            self.bow_analyzer(comment.get("body")))

        score = comment.get("score")

        return [num_chars, num_links, num_words, num_words, 
                avg_word_length, sentiment]

    def transform_comment(self, comment):
        return numpy.hstack((
            numpy.array([self.text_features(comment)], 
                        dtype='float_'),
            self.bag_of_words(comment.get("body"))))

    def score_comment(self, comment):
        return comment.get("score")

    def transform(self, comments):
        """ Returns a Nx(D+1) numpy matrix of features. The first D columns
        correspond to features, where the final column corresponds to the
        scores of each comment"""

        # if it's a single instance, return an array
        if isinstance(comments, dict):
            return self.transform_comment(comments)

        # http://scikit-learn.org/stable/modules/feature_extraction.html
        self.bow_vectorizer = CountVectorizer(min_df=1)
        self.bow_vectorizer.fit([c.get("body") for c in comments])
        self.bow_analyzer = self.bow_vectorizer.build_analyzer()

        def features_and_label(comment):
            return numpy.hstack((
                self.transform_comment(comment),
                numpy.array([[self.score_comment(comment)]], 
                            dtype='float_')))

        return numpy.vstack([features_and_label(c) 
                             for c in comments])
class BiGramPreProcessor(PreProcessor):
    def __init__(self, url_list=None, vocab=None):
        self.stemmer = RSLPStemmer()
        self.vectorizer = CountVectorizer(preprocessor=self.stemmer.stem, tokenizer=tokenizer_with_numeric,
                                          ngram_range=(1,2))
        if url_list is not None:
            self.fit_vocab(url_list)
        else:
            self.vectorizer.vocabulary_ = vocab
        self.vocab_size = len(self.vectorizer.vocabulary_)

    def fit_vocab(self, url_list):
        text_generator = url2text_generator(url_list)
        self.vectorizer.fit(text_generator)

    def url_to_bow(self, url):
        print url
        text_generator = url2text_generator([url])
        sparse_matrix = self.vectorizer.transform(text_generator)
        return [(sparse_matrix.indices[i], value) for i, value in enumerate(sparse_matrix.data)]

    def idf(self, term_id):
        return None

    def dict_from_idf(self, idf_path):
        return None
    def generatePredictingModel(data):
        """
            Build the prediction model (based on the data set we have) in order to be able to predict the category
            of a new video from the user input
            Return a classifier able to predict the category of a video based on its title and description.
        """
        try:
            # Intitialize a timer to compute the time to build the model
            start = time.time()

            # Split into train-test data set
            X = data[[x for x in data.columns if x in ('title', 'description')]]
            Y = data[[x for x in data.columns if x in ('video_category_id',)]]  # a 1-tuple; without the comma this would be a plain string
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.80, random_state = 10)

            # Build the 2 text corpus
            corpus_title = X_train['title'].values.tolist()
            corpus_description = X_train['description'].values.tolist()

            # initializes the 2 vectorizers.
            count_vectorizer_title = CountVectorizer()
            count_vectorizer_description = CountVectorizer()

            # learn the 2 vocabulary dictionary
            count_vectorizer_title.fit(corpus_title)
            count_vectorizer_description.fit(corpus_description)

            # Build the sparse matrices
            X_train_count_title = count_vectorizer_title.transform(X_train['title'])
            X_train_count_description = count_vectorizer_description.transform(X_train['description'])
            X_test_count_title = count_vectorizer_title.transform(X_test['title'])
            X_test_count_description = count_vectorizer_description.transform(X_test['description'])

            # Set and train the models (for title and description features)
            model_count_title = BernoulliNB()
            model_count_description = BernoulliNB()
            model_count_title.fit(X_train_count_title, Y_train['video_category_id'])
            model_count_description.fit(X_train_count_description, Y_train['video_category_id'])

            # Merge the title and description predictions and build a new prediction based on these 2 predictions combined
            new_df_train = pd.DataFrame()
            new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title)
            new_df_train['description_prediction'] = model_count_description.predict(X_train_count_description)
            new_df_test = pd.DataFrame()
            new_df_test['title_prediction'] = model_count_title.predict(X_test_count_title)
            new_df_test['description_prediction'] = model_count_description.predict(X_test_count_description)
            tree = DecisionTreeClassifier()
            tree.fit(new_df_train, Y_train)

            end = time.time()
            execution_time = end - start

            print "Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(execution_time)
            time.sleep(3)

            return tree, model_count_title, model_count_description,count_vectorizer_title,count_vectorizer_description

        except:
            raise VideoAnalysisException(" Error while creation of predictive model ")
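    # Hypothetical usage sketch for the objects returned above: predict the category of one new
    # video from its title and description (the helper name and arguments are illustrative only).
    def predictCategory(title, description, tree, model_count_title, model_count_description,
                        count_vectorizer_title, count_vectorizer_description):
        counts_title = count_vectorizer_title.transform([title])
        counts_description = count_vectorizer_description.transform([description])
        combined = pd.DataFrame()
        combined['title_prediction'] = model_count_title.predict(counts_title)
        combined['description_prediction'] = model_count_description.predict(counts_description)
        return tree.predict(combined)[0]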
 def __init__(self, subset, n_character_deleted=1):
     assert subset in ['train', 'valid', 'test']
     twenty_news_groups = fetch_20newsgroups(subset=subset)
     count_vect = CountVectorizer()
     count_vect.fit(twenty_news_groups.data)
     self.words = list(count_vect.vocabulary_.keys())  # list() so random.shuffle can modify it in place
     random.shuffle(self.words)
     self.idx = 0
Example #31
    "imdb": "../../datasets/opiniones/imdb_labelledes.csv"
}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)

df = pd.concat(df_list)
print(df.head())

#First strategy: build a word counter

frases = ["A Victor le gusta linux", "Victor no no le gusta Tortoise"]
etiquetas = [1, 0]
#CountVectorizer to turn the phrases into count vectors
print(frases)
vectorizar = CountVectorizer(lowercase=False)
vectorizar.fit(frases)
print(vectorizar.vocabulary_)

feature_vector = vectorizar.transform(frases).toarray()
print(feature_vector)

X = feature_vector
y = etiquetas

print(X)
print(y)
Example #32
    def fit(self,
            corpus,
            max_df_frac=0.90,
            min_df_frac=0.000025,
            is_featurizer_for_test=False):

        logging.info('Usage at beginning of featurizer fit: %s',
                     resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e6)

        if is_featurizer_for_test:
            paper_ids_for_training = corpus.train_ids + corpus.valid_ids
        else:
            paper_ids_for_training = corpus.train_ids

        # Fitting authors and venues
        logging.info('Fitting authors and venues')
        author_counts = collections.Counter()
        venue_counts = collections.Counter()
        keyphrase_counts = collections.Counter()
        for doc_id in tqdm.tqdm(paper_ids_for_training):
            doc = corpus[doc_id]
            author_counts.update(doc.authors)
            venue_counts.update([doc.venue])
            keyphrase_counts.update(doc.key_phrases)

        c = 1
        for author, count in author_counts.items():
            if count >= self.min_author_papers:
                self.author_to_index[author] = c
                c += 1

        c = 1
        for venue, count in venue_counts.items():
            if count >= self.min_venue_papers:
                self.venue_to_index[venue] = c
                c += 1

        c = 1
        for keyphrase, count in keyphrase_counts.items():
            if count >= self.min_keyphrase_papers:
                self.keyphrase_to_index[keyphrase] = c
                c += 1

        # Step 1: filter out some words and make a vocab
        if self.use_pretrained:
            vocab_file = dp.vocab_for_corpus('shared')
            with open(vocab_file, 'r') as f:
                vocab = f.read().split()
        else:
            logging.info('Cleaning text.')
            all_docs_text = [
                ' '.join((_clean(corpus[doc_id].title),
                          _clean(corpus[doc_id].abstract)))
                for doc_id in tqdm.tqdm(paper_ids_for_training)
            ]

            logging.info('Fitting vectorizer...')
            if self.max_features is not None:
                count_vectorizer = CountVectorizer(
                    max_df=max_df_frac,
                    max_features=self.max_features,
                    stop_words=self.STOPWORDS)
            else:
                count_vectorizer = CountVectorizer(max_df=max_df_frac,
                                                   min_df=min_df_frac,
                                                   stop_words=self.STOPWORDS)
            count_vectorizer.fit(tqdm.tqdm(all_docs_text))
            vocab = count_vectorizer.vocabulary_

        logging.info('Usage after word count: %s',
                     resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e6)

        # Step 4: Initialize mapper from word to index
        self.word_indexer = FeatureIndexer(vocab=vocab,
                                           use_pretrained=self.use_pretrained)
        self.n_features = 1 + len(self.word_indexer.word_to_index)

        logging.info('Usage after word_indexer: %s',
                     resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e6)
        logging.info('Usage at end of fit: %s',
                     resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e6)
        logging.info('Total words %d ' % len(self.word_indexer.word_to_index))
        logging.info('Total authors %d ' % self.n_authors)
        logging.info('Total venues %d ' % self.n_venues)
        logging.info('Total keyphrases %d ' % self.n_keyphrases)
Example #33
text_1=read_text('LOTR1.txt')
text_2=read_text('LOTR2.txt')
text_3=read_text('LOTR3.txt')
text = [text_1,text_2,text_3]

lotrDF = DataFrame()

# basic text level features
lotrDF['text'] = text
lotrDF['char_count'] = lotrDF['text'].apply(len)
lotrDF['word_count']=lotrDF['text'].apply(lambda x: len(x.split()))
lotrDF['word_density']=lotrDF['char_count']/(lotrDF['word_count']+1)

# count vectorizer
count_vec = CountVectorizer(stop_words='english', analyzer='word')
count_fit = count_vec.fit(lotrDF['text'])
vector_count=count_fit.transform(lotrDF['text'])
count_feat=count_vec.get_feature_names() 
count_set = set(count_feat)
count_freqs=zip(count_feat,vector_count.sum(axis=0).tolist()[0])

fellowship_count_vec = CountVectorizer(stop_words='english', analyzer='word')
fellowship_vector = fellowship_count_vec.fit_transform([text_1])
fellowship_feat = fellowship_count_vec.get_feature_names()
fellowship_set = set(fellowship_feat)

towers_count_vec = CountVectorizer(stop_words='english', analyzer='word')
towers_vector = towers_count_vec.fit_transform([text_2])
towers_feat = towers_count_vec.get_feature_names()
towers_set = set(towers_feat)
Example #34
from sklearn import preprocessing
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import joblib

df = pd.read_csv('Movie_Metadata_Sentiments.csv')
# Subset only emotions required to get overall emotion detected from the text content
sub_df = df[['anger', 'joy', 'fear', 'sadness']]
# Label the movie with the highest count of emotions
df['Max'] = sub_df.idxmax(axis=1)
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,
                     stop_words='english',
                     ngram_range=(1, 1),
                     tokenizer=token.tokenize)
cv = cv.fit(df['Text_Content'])
text_counts = cv.transform(df['Text_Content'])
# Save the vectorizer
joblib.dump(cv, "vectorizer.pkl")

X_train, X_test, y_train, y_test = train_test_split(text_counts,
                                                    df['Max'],
                                                    test_size=0.2,
                                                    random_state=1)

print(X_train.shape)
le = preprocessing.LabelEncoder()
le.fit(y_train)
print(le.classes_)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
Example #35
train_x = train[['creativeSize']]  # the continuous feature that was not processed above
test_x = test[['creativeSize']]

for feature in one_hot_feature:
    enc.fit(data[feature].values.reshape(-1, 1))
    train_a = enc.transform(train[feature].values.reshape(-1, 1))
    test_a = enc.transform(test[feature].values.reshape(-1, 1))
    train_x = sparse.hstack(
        (train_x,
         train_a))  # scipy.sparse: hstack stacks train_x and train_a horizontally into one sparse matrix
    test_x = sparse.hstack((test_x, test_a))
print('one-hot prepared !')

cv = CountVectorizer()
for feature in vector_feature:
    cv.fit(data[feature])
    train_a = cv.transform(train[feature])
    test_a = cv.transform(test[feature])
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('cv prepared !')


def LGB_test(train_x, train_y, test_x, test_y):
    print("LGB test")
    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             num_leaves=31,
                             reg_alpha=0.0,
                             reg_lambda=1,
                             max_depth=-1,
                             n_estimators=1000,
Example #36
def svm():
    train = load_model('model_rf/train_bow1_2.pkl')
    if train is None:
        train = load_data('datavn/train')

    vectorizer = load_model('model_rf/vectorizer_bow1_2.pkl')
    if vectorizer is None:
        # vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_df=0.7, min_df=2, max_features=1000)
        vectorizer = CountVectorizer(ngram_range=(1, 2),
                                     max_df=0.7,
                                     min_df=2,
                                     max_features=1000)
    test = load_model('model/test1_2.pkl')
    if test is None:
        test = load_data('datavn/test')

    train_text = train["question"].values
    test_text = test["question"].values

    vectorizer.fit(train_text)
    X_train = vectorizer.transform(train_text)
    joblib.dump(vectorizer, 'model_rf/vectorizer_bow1_2.pkl')
    X_train = X_train.toarray()
    y_train = train["label1"]
    y_train2 = train["label2"]

    X_test = vectorizer.transform(test_text)
    X_test = X_test.toarray()
    y_test = test["label1"]
    y_test2 = test["label2"]
    # joblib.dump(vectorizer, 'model/vectorizer2.pkl')
    print "---------------------------"
    print "Training"
    print "---------------------------"
    # iterate over classifiers
    clf = load_model('model_rf/bow1_2.pkl')  # same path as the joblib.dump below
    if clf is None:
        t0 = time.time()
        clf = RandomForestClassifier(n_estimators=100)
        clf.fit(X_train, y_train)
        joblib.dump(clf, 'model_rf/bow1_2.pkl')
        print " %s - Training completed %s" % (datetime.datetime.now(),
                                               time_diff_str(t0, time.time()))
    t1 = time.time()
    y_pred = clf.predict(X_test)
    print " %s - Converting completed %s" % (datetime.datetime.now(),
                                             time_diff_str(t1, time.time()))
    print " accuracy: %0.3f" % accuracy_score(y_test, y_pred)
    print " f1 accuracy: %0.3f" % f1_score(y_test, y_pred, average='weighted')
    print "confuse matrix: \n", confusion_matrix(
        y_test, y_pred, labels=["ABBR", "DESC", "ENTY", "HUM", "LOC", "NUM"])

    print "-----------------------"
    print "fine grained category"
    print "-----------------------"
    clf2 = load_model('model_rf/bow_fine1_2.pkl')
    if clf2 is None:
        t2 = time.time()
        clf2 = RandomForestClassifier(n_estimators=100)
        clf2.fit(X_train, y_train2)
        joblib.dump(clf2, 'model_rf/bow_fine1_2.pkl')  # same path as the load_model above
        print " %s - Training for fine grained category completed %s" % (
            datetime.datetime.now(), time_diff_str(t2, time.time()))
    t3 = time.time()
    y_pred2 = clf2.predict(X_test)
    print " %s - Converting completed %s" % (datetime.datetime.now(),
                                             time_diff_str(t3, time.time()))
    print " accuracy for fine grained category: %0.3f\n" % accuracy_score(
        y_test2, y_pred2)
    print " f1 accuracy: %0.3f" % f1_score(
        y_test2, y_pred2, average='weighted')
Example #37
from sklearn.linear_model import ElasticNet




# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

TokenBasic ='\\S+(?=\\s+)'
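# Quick illustration with hypothetical sample strings: both patterns use a lookahead for trailing
# whitespace, so a token is only captured when whitespace follows it.
import re
print(re.findall(TOKENS_ALPHANUMERIC, "pay 100 dollars now "))  # ['pay', '100', 'dollars', 'now']
print(re.findall(TokenBasic, "pay $100 now "))                  # ['pay', '$100', 'now']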

# Fill missing values in df.Position_Extra

vec_alphanumeric = CountVectorizer(token_pattern = TOKENS_ALPHANUMERIC,stop_words='english')

# Fit to the data
vec_alphanumeric.fit(Jc.trans)


print(msg.format(len(vec_alphanumeric.get_feature_names())))
print(vec_alphanumeric.get_feature_names()[:70])


# Split out only the text data
X_train, X_test, y_train, y_test = train_test_split(Jc.trans,
                                                    Jc.readRatePercent, 
                                                    random_state=42)


# Pipeline steps (fragment):
# ('tfidf', TfidfVectorizer())
# ('vec', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC, stop_words='english', ...))
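# A hypothetical reconstruction of how those two steps might be wired together, assuming the
# intent was a FeatureUnion of both vectorizers feeding the ElasticNet imported above:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import ElasticNet

pl = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer()),
        ('vec', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC, stop_words='english')),
    ])),
    ('reg', ElasticNet()),
])
pl.fit(X_train, y_train)
print(pl.score(X_test, y_test))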
# In[10]:

train['question1'] = train['question1'].fillna("dhainchu")
train['question2'] = train['question2'].fillna("dhainchu")

# In[11]:

print("Creating the vocabulary of words occurred more than",
      MIN_WORD_OCCURRENCE)
all_questions = pd.Series(train['question1'].tolist() +
                          train['question2'].tolist()).unique()
cv = CountVectorizer(lowercase=False,
                     token_pattern="\S+",
                     min_df=MIN_WORD_OCCURRENCE)
cv.fit(all_questions)
top_words = set(cv.vocabulary_.keys())
top_words.add(REPLACE_WORD)

# In[12]:

embeddings_index = get_embedding()

# In[13]:

print("Words are not found in the embedding:",
      top_words - embeddings_index.keys())
top_words = embeddings_index.keys()

# In[14]:
            print('-----finished corpus tokenization-----')
            # Load the document you wish to summarize
            title = 'American Missouri River Dakota Access Pipeline Fort Yates Standing Rock America Bakkan Sioux Youth Army Corps Engineer North Obama Trump Native DAPL Radio Energy Transfer Gonacon'
            count = 0
            for ele in raw:
                document = ele['Sentences']
                cleaned_document = clean_document(document)
                doc = remove_stop_words(cleaned_document)

                # Merge corpus data and new document data (join each corpus doc into one string)
                joined_corpus = [' '.join(d) for d in data]
                train_data = set(joined_corpus + [doc])

                # Fit and Transform the term frequencies into a vector
                count_vect = CountVectorizer()
                count_vect = count_vect.fit(train_data)
                freq_term_matrix = count_vect.transform(train_data)
                feature_names = count_vect.get_feature_names()

                # Fit and Transform the TfidfTransformer
                tfidf = TfidfTransformer(norm="l2")
                tfidf.fit(freq_term_matrix)

                # Get the dense tf-idf matrix for the document
                story_freq_term_matrix = count_vect.transform([doc])
                story_tfidf_matrix = tfidf.transform(story_freq_term_matrix)
                story_dense = story_tfidf_matrix.todense()
                doc_matrix = story_dense.tolist()[0]

                # Get Top Ranking Sentences and join them as a summary
                top_sents = rank_sentences(doc, doc_matrix, feature_names,top_n=1)
Example #40
from sklearn.feature_extraction.text import CountVectorizer

# Note we're doing "CountVectorizer" here and not TfidfVectorizer. Hmm...
word_features = CountVectorizer(
    strip_accents="unicode",
    lowercase=True,
    ngram_range=(1, 1),
)

# How does it take a whole paragraph and turn it into words?
text_to_words = word_features.build_analyzer()
# text_to_words is a function (str) -> List[str]
assert text_to_words("Hello world!") == ["hello", "world"]

# Learn columns from training data (again)
word_features.fit(ex_train)
# Translate our list of texts -> matrices of counts
X_train = word_features.transform(ex_train)
X_vali = word_features.transform(ex_vali)
X_test = word_features.transform(ex_test)

print(X_train.shape, X_vali.shape, X_test.shape)
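# A minimal sketch of the alternative hinted at above (not what this script does): swapping in
# TfidfVectorizer keeps the same fit/transform API but re-weights counts by inverse document frequency.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_features = TfidfVectorizer(strip_accents="unicode", lowercase=True, ngram_range=(1, 1))
X_train_tfidf = tfidf_features.fit_transform(ex_train)
X_vali_tfidf = tfidf_features.transform(ex_vali)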

#%% Accumulate results here; to be box-plotted.
results: Dict[str, List[float]] = {}

#%% try sklearn MultinomialNB:

## SKLearn has it's own Multinomial Naive Bayes,
#  and it uses the alpha / additive smoothing to deal with zeros!
from sklearn.naive_bayes import MultinomialNB
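# Minimal sketch, assuming label arrays y_train / y_vali exist alongside the splits above:
nb = MultinomialNB(alpha=1.0)  # alpha is the additive (Laplace) smoothing mentioned above
nb.fit(X_train, y_train)
results["MNB(alpha=1.0)"] = [nb.score(X_vali, y_vali)]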
    n = data[i].nunique()
    if n > 5:
        print(i)
        data = feature_count(data, [i])  # build count features for the crossed feature
    else:
        print(i, ':', n)
#%%
# user_tags CountVectorizer
train_new = pd.DataFrame()
test_new = pd.DataFrame()
train = data[:train.shape[0]]
test = data[train.shape[0]:]
train_y = train['click']

cntv = CountVectorizer()
cntv.fit(train['user_tags'])
train_a = cntv.transform(train['user_tags'])
test_a = cntv.transform(test['user_tags'])
train_new = sparse.hstack(
    (train_new, train_a), 'csr'
)  # hstack(blocks, format=None, dtype=None): stacks matrices horizontally; the blocks must have matching row counts
test_new = sparse.hstack((test_new, test_a), 'csr')
SKB = SelectPercentile(chi2, percentile=95).fit(
    train_new,
    train_y)  # note: SelectKBest keeps the top-n ranked features, SelectPercentile keeps the top n% of features
train_new = SKB.transform(train_new)
test_new = SKB.transform(test_new)
'''
Sparse matrix storage formats:
# - the COO format is efficient for constructing a matrix
# - the CSC and CSR formats are efficient for multiplication
'''
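# A small illustration of the trade-off described above (toy 3x3 matrix, values are arbitrary):
from scipy import sparse
import numpy as np

coo = sparse.coo_matrix((np.ones(3), ([0, 1, 2], [2, 0, 1])), shape=(3, 3))  # cheap to construct
csr = coo.tocsr()  # efficient row slicing and matrix products
csc = coo.tocsc()  # efficient column slicing
print(type(coo), type(csr), type(csc))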
Example #42
train_data, test_data, train_labels, test_labels = train_test_split(
    all_tweets, labels, test_size=0.2, random_state=1)

print(len(train_data))
print(len(test_data))

# -------------------------------------------------
# Transform tweets into count vectors
from sklearn.feature_extraction.text import CountVectorizer

counter = CountVectorizer()

# Teach the counter the vocabulary

counter.fit(train_data)

# Transform into count vectors

train_counts = counter.transform(train_data)
test_counts = counter.transform(test_data)

print(train_counts[3])
print(test_counts[3])

# ------------------------------------
# Train and test the Classifier

from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
Example #43
simple_train = [
    'call you tonight', 'Call me a cab', 'please call me... PLEASE!'
]

# From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):
#
# > Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect **numerical feature vectors with a fixed size** rather than the **raw text documents with variable length**.
#
# We will use [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to "convert text into a matrix of token counts":

# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

# learn the 'vocabulary' of the training data (occurs in-place)
vect.fit(simple_train)

# examine the fitted vocabulary
vect.get_feature_names()

# transform training data into a 'document-term matrix'
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

# convert sparse matrix to a dense matrix
simple_train_dtm.toarray()

# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())

# From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):
Example #44
def mlprice(request):
    nome = ''
    result = 'Here goes the price'
    productname = 'Example: Spiderman Figure Action Marvel #123'
    if request.method == 'POST':

        nome = request.POST['nome']  #get the url
        re = requests.get(nome)  #make a requisition to page
        soup = BeautifulSoup(re.text, 'html.parser')
        needed = [
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ','
        ]  #here is just an array to extract numbers and ',' from the requisition
        test_texts = [
            value for element in soup.find_all(class_=True)
            for value in element["class"]
        ]  #here we're getting all the classes name from the site structure

        possible_texts = [
            "game_purchase_price", "price", "catalog-detail-price-value",
            "preco_desconto", "preco", "preco_desconto_avista-cm",
            'css-ovezyj', "currency-value", "best-price"
        ]  #possible price classes extracted from some websites
        negative_texts = [
            "container", "pop_up", "menu", "hr", "nav-menu-item"
        ]  #menu, popups, etc.
        training_texts = possible_texts + negative_texts
        training_labels = ["positive"] * len(
            negative_texts) + ["negative"] * len(possible_texts)
        vectorizer = CountVectorizer()
        vectorizer.fit(training_texts)
        training_vectors = vectorizer.transform(training_texts)
        testing_vectors = vectorizer.transform(test_texts)
        classifier = tree.DecisionTreeClassifier()
        classifier.fit(training_vectors, training_labels)
        predictions = classifier.predict(testing_vectors)
        c = 0  #counter
        valuesInsideFoundit = []

        for i in predictions:  #here's are passing throung the predictions
            if i == "positive":  #if it's possible to be a price
                foundit = soup.find(
                    class_=test_texts[c]
                )  #we will get the text inside that class from test_text in the index of the variable 'c'
                valuesInsideFoundit.append(
                    foundit.text
                )  #and we will append that value that we have just find
            c += 1  #counter increment

        firstValuesInsideIt = []
        #↓ Here we filter the values, because on some pages we will find many prices or symbols from our 'dictionary' (the 'needed' variable), and will probably
        #see values like: ["R", "R", "$", "1", "3", "R", "$"].
        #So we need to filter them to get an expected value like "R$123,99".

        for k in filter(
                None, valuesInsideFoundit
        ):  #passing through the values without passing through empty indexes
            cc = 0  #counter
            for y in list(
                    k):  #passing through any index inside the filtered list.
                if y in needed or y == "R" and list(
                        k
                )[cc + 1] == "$" or y == "$" and list(
                        k
                )[cc -
                  1] == "R":  #if y is inside needed variable or y is "R" and k in the next index of cc is "$"
                    #or y is "$" and k in the last index of cc is "R" it's probably a value in Reais.
                    firstValuesInsideIt.append(
                        str(y).replace("\n", "").replace(' ', '')
                    )  #let's padding it and remove the spaces and the line jumps
                else:  #if not, let's ignore it
                    pass
                cc += 1  #cc increment
        ccc = 0
        whatWeWant = ""
        indexOf = 0
        #more formating
        for b in firstValuesInsideIt:
            if b == "R" and firstValuesInsideIt[ccc + 1] == "$":
                indexOf = firstValuesInsideIt.index(
                    str(firstValuesInsideIt[ccc]))
                break
            ccc += 1

        lastFormatedValues = []
        cccc = 0
        for y in firstValuesInsideIt[indexOf:]:

            lastFormatedValues.append(y)
            try:
                if lastFormatedValues[cccc - 2] == ",":
                    break
            except:
                pass
            cccc += 1
        ccccc = 0
        indexOf2 = 0
        for bb in lastFormatedValues:
            if lastFormatedValues[ccccc] == "R" and lastFormatedValues[
                    ccccc + 1] == "$" and lastFormatedValues[ccccc +
                                                             2] in needed:
                indexOf2 = lastFormatedValues.index(
                    str(lastFormatedValues[ccccc]))
            ccccc += 1
        for z in lastFormatedValues[indexOf2 - 8:]:
            whatWeWant = whatWeWant + z
        #our variable recives our value.
        al = re.text
        productname = al[al.find('<title>') + 7:al.find('</title>')]
        print(productname)

    #let's put it on page:
    return render(request, 'savepage/main.html', {
        'nome': nome,
        'result': result,
        'productname': productname
    })
Example #45
print("total examples %s" % len(labels))

# split the dataset into training and test datasets
train_x, test_x, train_y, test_y = train_test_split(dataDF['text'],
                                                    dataDF['label'],
                                                    random_state=24,
                                                    test_size=0.2)

# label encode the target variable
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

# create a count vectorizer object
count_vect = CountVectorizer(analyzer=lambda x: x)
count_vect.fit(dataDF['text'])

# transform the training and test data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xtest_count = count_vect.transform(test_x)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer=lambda x: x)
tfidf_vect.fit(dataDF['text'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xtest_tfidf = tfidf_vect.transform(test_x)
'''
MODELS
'''

## Naive Bayes
Example #46
                y.append(path)
    return X, y


#File-writing helper function
def writeFile(txtArrAfter, outputName, path):
    f = open(path + "/" + outputName + ".txt", 'a',encoding="utf-8")
    f.write(str(txtArrAfter))
    f.close()

#Load data into X_data, y_data
train_path = os.path.join(dir_path, 'C:/Users/vai22/OneDrive/Desktop/Data/Train_Full')
X_data, y_data = get_data(train_path)

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X_data)

# transform the training and validation data using count vectorizer object
X_train_count = count_vect.transform(X_data)



# word level - cap the vocabulary at 30000 words instead of using all words (100k+ words)
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)
tfidf_vect.fit(X_data) # learn vocabulary and idf from training set
X_data_tfidf =  tfidf_vect.transform(X_data)
# assume that we do not have a separate test set yet


# writeFile(X_data,'X_data', 'C:/Users/vai22/OneDrive/Desktop/final/Data/vnexpress.net')
# writeFile(y_data,'y_data', 'C:/Users/vai22/OneDrive/Desktop/final/Data/vnexpress.net')
Example #47
q2_word_test = [x[1] for x in datas_test]
q1_char_train = [x[2] for x in datas_train]
q1_char_dev = [x[2] for x in datas_dev]
q1_char_test = [x[2] for x in datas_test]
q2_char_train = [x[3] for x in datas_train]
q2_char_dev = [x[3] for x in datas_dev]
q2_char_test = [x[3] for x in datas_test]
label_train = [x[4] for x in datas_train]
label_dev = [x[4] for x in datas_dev]
label_test = [x[4] for x in datas_test]

# sklearn extract feature
# feature1: count(csr_matrix)
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(
    1, 1))  # token_pattern must remove \w, or single char not counted
vectorizer.fit(q1_word_train + q2_word_train)
q1_train_count = vectorizer.transform(q1_word_train)
q1_dev_count = vectorizer.transform(q1_word_dev)
q1_test_count = vectorizer.transform(q1_word_test)
q2_train_count = vectorizer.transform(q2_word_train)
q2_dev_count = vectorizer.transform(q2_word_dev)
q2_test_count = vectorizer.transform(q2_word_test)

# feature2: binary(csr_matrix)
q1_train_binary = q1_train_count.copy()
q1_dev_binary = q1_dev_count.copy()
q1_test_binary = q1_test_count.copy()
q1_train_binary[q1_train_binary > 0] = 1.0
q1_dev_binary[q1_dev_binary > 0] = 1.0
q1_test_binary[q1_test_binary > 0] = 1.0
q2_train_binary = q2_train_count.copy()
Messages['preprocessed message'] = Messages.apply(
    lambda row: stopWordRemoval(row['lemmantized words']), axis=1)
'''print('\nData Frame after stop word removal: \n', Messages[0:2])'''

TrainingData = pd.Series(list(Messages['preprocessed message']))
TrainingLabel = pd.Series(list(Messages['label']))

## Feature Extraction (Convert the text content into the vector form)
### Bag of Words (BOW) is the most widely used method for generating features in NLP: it counts word frequencies, which can then be used as features for training a classifier
### A Term-Document Matrix (TDM) holds the frequencies of terms across a collection of documents; rows correspond to documents and columns to terms

tfVectorizer = CountVectorizer(ngram_range=(1, 2),
                               min_df=(1 / len(TrainingLabel)),
                               max_df=0.7)
totalDictionaryTDM = tfVectorizer.fit(TrainingData)
messageDataTDM = totalDictionaryTDM.transform(TrainingData)

print(messageDataTDM.shape)
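
# A tiny self-contained illustration (not part of the original pipeline) of the
# term-document matrix described above: rows are documents, columns are terms,
# and each cell holds the raw frequency of a term in a document.
toy_corpus = ["spam spam ham", "ham eggs"]
toy_vect = CountVectorizer()
toy_tdm = toy_vect.fit_transform(toy_corpus)
print(sorted(toy_vect.vocabulary_))  # ['eggs', 'ham', 'spam']
print(toy_tdm.toarray())             # [[0 1 2]
                                     #  [1 1 0]]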

### Term Frequency-Inverse Document Frequency (TF-IDF): IDF diminishes the weight of the most commonly occurring words and increases the weight of rare words

tfIdfVectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                  min_df=(1 / len(TrainingLabel)),
                                  max_df=0.7)
totalDictionaryTFIDF = tfIdfVectorizer.fit(TrainingData)
messageDataTFIDF = totalDictionaryTFIDF.transform(TrainingData)

print(messageDataTFIDF.shape)
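
# A matching toy illustration (again not from the original source) of the IDF
# effect described above: 'movie' occurs in every document, so it receives the
# smallest idf weight, while rarer words are weighted more heavily.
toy_docs = ["good movie", "bad movie", "great movie plot"]
toy_tfidf = TfidfVectorizer()
toy_tfidf.fit(toy_docs)
for term, idx in sorted(toy_tfidf.vocabulary_.items()):
    print(term, round(toy_tfidf.idf_[idx], 3))  # 'movie' -> 1.0, the rest -> ~1.693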

## Splitting the training and test data
class SentimentAnalyzer:
    final_model_path = 'data/model_cache/final_model.sav'
    vocabulary_path = 'data/model_cache/vocabulary.pkl'
    data_train_path = 'data/movie_data/full_train.txt'
    data_test_path = 'data/movie_data/full_test.txt'

    def preprocess_reviews(self, reviews):
        REPLACE_NO_SPACE = re.compile(r"[.;:!\'?,\"()\[\]]")
        REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
        reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
        reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]

        return reviews

    def load_model(self, train_required):
        if train_required:
            self.train()
        else:
            self.load()

    def load(self):
        self.final_model = pickle.load(
            open(SentimentAnalyzer.final_model_path, 'rb'))
        self.cv = CountVectorizer(binary=True,
                                  vocabulary=pickle.load(
                                      open(SentimentAnalyzer.vocabulary_path,
                                           "rb")))

    def train(self):
        print("Training...")

        # data preprocessing
        reviews_train = []
        for line in open(SentimentAnalyzer.data_train_path,
                         encoding='utf8',
                         mode='r'):
            reviews_train.append(line.strip())

        reviews_test = []
        for line in open(SentimentAnalyzer.data_test_path,
                         encoding='utf8',
                         mode='r'):
            reviews_test.append(line.strip())

        reviews_train_clean = self.preprocess_reviews(reviews_train)
        reviews_test_clean = self.preprocess_reviews(reviews_test)

        # vectorization
        self.cv = CountVectorizer(binary=True)
        self.cv.fit(reviews_train_clean)
        X = self.cv.transform(reviews_train_clean)
        X_test = self.cv.transform(reviews_test_clean)

        # the first 12.5k reviews are positive and the last 12.5k are negative.
        target = [1 if i < 12500 else 0 for i in range(25000)]

        X_train, X_val, y_train, y_val = train_test_split(X,
                                                          target,
                                                          train_size=0.75)

        # choose the best hyperparameter C, which adjusts the regularization strength
        accuracy = 0
        c = 0
        for current_c in [0.01, 0.05, 0.25, 0.5, 1]:
            lr = LogisticRegression(C=current_c, solver='liblinear')
            lr.fit(X_train, y_train)
            current_accuracy = accuracy_score(y_val, lr.predict(X_val))
            if current_accuracy > accuracy:
                accuracy = current_accuracy
                c = current_c
            print(f"Accuracy for C = {current_c}: {current_accuracy}")

        print(f"The best accuracy is {accuracy}, C = {c}")

        # train final model
        self.final_model = LogisticRegression(C=c, solver='liblinear')
        self.final_model.fit(X, target)
        print(
            f"Final accuracy is {accuracy_score(target, self.final_model.predict(X_test))}"
        )

        # save final model
        pickle.dump(self.final_model,
                    open(SentimentAnalyzer.final_model_path, 'wb'))
        pickle.dump(self.cv.vocabulary_,
                    open(SentimentAnalyzer.vocabulary_path, "wb"))

    def predict(self, new_review):
        # predict on new review
        new_review_clean = self.preprocess_reviews([new_review])
        X_new_review = self.cv.transform(new_review_clean)
        y_new_review = self.final_model.predict(X_new_review)
        return y_new_review
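
# A usage sketch for the class above (not part of the original source); it
# assumes the movie-review files and the data/model_cache directory named in
# the class attributes actually exist on disk.
if __name__ == '__main__':
    analyzer = SentimentAnalyzer()
    analyzer.load_model(train_required=True)  # pass False to reuse the pickled model
    print(analyzer.predict("A surprisingly good movie with a great cast"))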
Example #50
def lasso(term, year_start=1990, year_end=2016, qa='A', reg_type='lasso'):

    ngram_range=(2, 5)


    vectorizer = CountVectorizer(max_features= 50000, ngram_range=ngram_range, stop_words='english', min_df=5)

    docs_all = document_iterator(type=qa, year_start=year_start, year_end=year_end, format='docs_only',
                                 search_term=term)
    vectorizer.fit(docs_all)

    vocabulary =  vectorizer.get_feature_names()

    vectorizer_plaintiff = TfidfVectorizer(vocabulary=vocabulary, ngram_range=ngram_range, use_idf=True)
    docs_plaintiff = document_iterator(type=qa, year_start=year_start, side_answer='Plaintiff', format='docs_only', search_term=term)
    dtm_plaintiff = vectorizer_plaintiff.fit_transform(docs_plaintiff)

    vectorizer_defendant = TfidfVectorizer(vocabulary=vocabulary, ngram_range=ngram_range, use_idf=True)
    docs_defendant = document_iterator(type=qa, year_start=year_start, side_answer='Defendant', format='docs_only', search_term=term)
    dtm_defendant = vectorizer_defendant.fit_transform(docs_defendant)

    X = vstack([dtm_plaintiff, dtm_defendant])

    y = np.empty(X.shape[0])

    # Plaintiff docs = 1, defendant docs = 0
    y[:dtm_plaintiff.shape[0]] = 1
    y[dtm_plaintiff.shape[0]:] = 0

    if reg_type == 'ridge':
        alpha = 0.00001
        clf = Ridge(alpha=alpha)
        clf.fit(X, y)
        coeff = clf.coef_

    elif reg_type == 'lasso':
        alpha = 0.0001
        clf = Lasso(alpha=alpha, max_iter=1000)
        clf.fit(X, y)
        coeff = clf.coef_


    elif reg_type == 'logistic':
        alpha=None
        clf = LogisticRegression()
        clf.fit(X, y)
        coeff = clf.coef_[0]

    mse = mean_squared_error(y, clf.predict(X))
    mae = mean_absolute_error(y, clf.predict(X))



    argsorted = np.argsort(coeff)
    min_coef = argsorted[:10]
    max_coef = argsorted[-10:][::-1]



    min_coefs = [(vocabulary[i], coeff[i]) for i in min_coef]
    max_coefs = [(vocabulary[i], coeff[i]) for i in max_coef]

    print "Using {} regression. Mean Squared Error: {}. Mean Absolute Error: {}".format(reg_type, mse, mae)
    print "Samples. Plaintiff: {}. Defendant: {}. Total: {}. Number of tokens: {}".format(dtm_plaintiff.shape[0],
                                                  dtm_defendant.shape[0], X.shape[0], X.shape[1])
    print "Predictors for Defendants:\n{}".format(min_coefs)
    print "\nPredictors for Plaintiffs:\n{}\n\n".format(max_coefs)
Example #51
def text_classification_tradition():
    train_df = read_data('./data/train_set.csv')
    test_df = read_data('./data/test_a.csv')
    data = pd.concat([train_df, test_df], axis=0)
    print(data.shape)
    """
    传统的文本表示方法:
    1. One-hot
    2. BOW(Bag of Words,词袋表示)
    3. N-gram
    4. TF-IDF
    
    使用sklearn feature_extraction.text里的文本表示接口时,输入格式为:
    corpus = [
    ...     'This is the first document.',
    ...     'This document is the second document.',
    ...     'And this is the third one.',
    ...     'Is this the first document?',
    ... ]
    详见:https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text
    """

    # from sklearn.preprocessing import OneHotEncoder
    #
    # # the set of all words in the corpus
    # words_set = set(' '.join(list(data['text'])).split(' '))
    #
    # # assign an index to every word (this step could also be done with sklearn's LabelEncoder)
    # word2idx = {}
    # idx2word = {}
    # for i, word in enumerate(words_set):
    #     word2idx[word] = i + 1
    #     idx2word[i + 1] = word
    # print(word2idx)
    # """
    # {'6981': 1, '6307': 2, '5367': 3, '1066': 4,...}
    # """
    #
    # # OneHotEncoder takes index values of shape (N_words, 1) and outputs the one-hot vector word_onehot for each index
    # idx = list(word2idx.values())
    # idx = np.array(idx).reshape(len(idx), -1)
    # print(idx.shape) #(2958, 1)
    # print(idx)
    # """
    # [[   1]
    #  [   2]
    #  [   3]
    #  ...
    #  [2956]
    #  [2957]
    #  [2958]]
    # """
    # onehotenc = OneHotEncoder()
    # onehotenc.fit(idx)
    # word_onehot = onehotenc.transform(idx).toarray()
    # for i, word_onehot_i in enumerate(word_onehot):
    #     print("{0}\t-->\t{1}".format(idx2word[i + 1], word_onehot_i))
    # """
    # 6981	-->	[1. 0. 0. ... 0. 0. 0.]
    # 6307	-->	[0. 1. 0. ... 0. 0. 0.]
    # """
    #
    # # usage: given a word, look up its idx, then take the corresponding one-hot vector from word_onehot
    # x = word_onehot[word2idx['6981']]
    # print(x)  # the one-hot vector corresponding to the idx of word '6981'

    # 2. BOW: CountVectorizer
    corpus = data['text'].values
    vectorizer = CountVectorizer(max_features=3000)
    vectorizer.fit(corpus)  # fit the feature extractor on the full corpus (train + test)
    X_train_all = vectorizer.transform(train_df['text'].values)
    y_train_all = train_df['label'].values
    X_test = vectorizer.transform(test_df['text'].values)

    X_train, X_valid, y_train, y_valid = train_test_split(X_train_all,
                                                          y_train_all,
                                                          test_size=0.1,
                                                          random_state=2020)
    clf = RidgeClassifier()
    clf.fit(X_train, y_train)
    y_valid_pred = clf.predict(X_valid)
    print("f1 score: %.6f" % f1_score(y_valid, y_valid_pred, average='macro'))
    """
    f1 score: 0.820636
    """

    y_test_pred = clf.predict(X_test)
    test_df['label'] = y_test_pred
    print(test_df.shape)  # (50000, 2)
    test_df[['label']].to_csv('./data/submission_bow_20200725.csv',
                              index=False)
    print(test_df['label'].value_counts())
    """
    1     11305
    0     10942
    2      8012
    3      5798
    4      3311
    5      2740
    6      1975
    7      1563
    9      1134
    8      1128
    10     1085
    11      548
    12      322
    13      137
    Name: label, dtype: int64
    """

    # 3. N-gram: CountVectorizer(ngram_range=(1,N))
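
    # A short sketch of the N-gram option mentioned above (not run in the
    # original code, so left commented out like the one-hot block): the same
    # CountVectorizer, but counting unigrams and bigrams.
    # ngram_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=3000)
    # ngram_vectorizer.fit(corpus)
    # X_train_ngram = ngram_vectorizer.transform(train_df['text'].values)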

    # 4. TF-IDF: TfidfVectorizer
    corpus = data['text'].values
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=3000)
    vectorizer.fit(corpus)
    X_train_all = vectorizer.transform(train_df['text'].values)
    y_train_all = train_df['label'].values
    X_test = vectorizer.transform(test_df['text'].values)

    X_train, X_valid, y_train, y_valid = train_test_split(X_train_all,
                                                          y_train_all,
                                                          test_size=0.1,
                                                          random_state=2020)
    clf = RidgeClassifier()
    clf.fit(X_train, y_train)
    y_valid_pred = clf.predict(X_valid)
    print("f1 score: %.6f" % f1_score(y_valid, y_valid_pred, average='macro'))
    """
    f1 score: 0.897664
    """

    y_test_pred = clf.predict(X_test)
    test_df['label'] = y_test_pred
    print(test_df.shape)  #
    test_df[['label']].to_csv('./data/submission_tfidf_20200725.csv',
                              index=False)
    print(test_df['label'].value_counts())
    """
X_train, X_val, y_train, y_val = train_test_split(dataset.content.values,
                                                  y,
                                                  stratify=y,
                                                  shuffle=True,
                                                  random_state=42,
                                                  test_size=0.1)
# print(X_train, X_val)
# print(X_train.shape)
# print(X_val.shape)

tfidf = TfidfVectorizer(max_features=1000, analyzer='word', ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)  # transform only, so validation uses the vocabulary/idf learned on the training data
# print(X_train_tfidf, X_val_tfidf)
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(dataset['content'])
X_train_count = count_vect.transform(X_train)
X_val_count = count_vect.transform(X_val)
# print(X_train_count, X_val_count)
# print(dataset.head(5))

#### Linear SVM
# text=voice.sesal()
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)


# print('lsvm using count vectors accuracy: %s' % accuracy_score(y_pred, y_val))
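
# The tf-idf features computed above are never used; a sketch (an addition, not
# from the original source) of the same linear SVM trained on them for comparison.
lsvm_tfidf = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = lsvm_tfidf.predict(X_val_tfidf)
# print('lsvm using tf-idf vectors accuracy: %s' % accuracy_score(y_pred_tfidf, y_val))

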
def pre_pro(text):
    print("Preprocessing fonksiyoyuna girdi")
Example #53
# for test
for topic in os.listdir(path):
  topic_dir = os.path.join(path, topic)  # use a distinct name so the pandas alias `pd` is not shadowed
  for file in os.listdir(topic_dir):
    file_path = os.path.join(topic_dir, file)
    text = open(file_path, 'r', encoding="utf8").read()
    X_test10.append(text)
    Y_test10.append(topic)


# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import preprocessing

count_vector = CountVectorizer()
count_vector.fit(X_train10)
X_tranform10 = count_vector.transform(X_train10)
# label encoder
le = preprocessing.LabelEncoder()
le.fit(Y_train10)
Y_transform10 = le.transform(Y_train10)

# for validation data
X_valid_transform10 = count_vector.transform(X_valid10)
Y_valid_transform10 = le.transform(Y_valid10)

# for test data
X_test_transform10 = count_vector.transform(X_test10)
Y_test_transform10 = le.transform(Y_test10)

Example #54
london_text = london_tweets['text'].tolist()
paris_text = paris_tweets['text'].tolist()

# combine all text into one long list of tweets
all_tweets = new_york_text + london_text + paris_text

# create labels for tweets by location; 0 = new york, 1 = london, 2 = paris
labels = [0] * len(new_york_text) + [1] * len(london_text) + [2] * len(
    paris_text)

# divide set into train and test set
train_data, test_data, train_labels, test_labels = train_test_split(
    all_tweets, labels, test_size=0.2, random_state=1)

# create a counter and transform train/test data
counter = CountVectorizer()
counter.fit(train_data + test_data)
train_counts = counter.transform(train_data)
test_counts = counter.transform(test_data)

# create a NB classifier, fit it with the training data and create predictions on the test data to evaluate the model
classifier = MultinomialNB()
classifier.fit(train_counts, train_labels)
predictions = classifier.predict(test_counts)

# classify by using accuracy_score
# print(accuracy_score(test_labels, predictions))

# classify by using confusion matrix
# print(confusion_matrix(test_labels, predictions))
Example #55
def read_gbif_extract_csv(
    file_name="../data/gbif_extract.csv",
    output_file="../data/gbif_extract_canonicalName_short.csv",
):
    """ load gbif csv data base """
    """ output csv files with short names and key """
    """ used to detect canonical names """
    """ see "search_canonicalName()" function in scispacy_lib.py module """

    important_cols = ["key", "canonicalName"]
    df = pd.read_csv(file_name)
    print(df.shape, list(df.columns))

    print("-important columns:")
    df = df[important_cols]
    print(df.shape, list(df.columns))

    # print(df.head(10))
    for c in list(df.columns):
        if c.find("Key") == -1:
            df[c] = df[c].apply(str).str.lower()
    print(df.shape, list(df.columns))
    print("canonicalName:")
    print(df.head(10))
    df["name_lst"] = df["canonicalName"].apply(lambda x: x.split(" "))
    df["size"] = df["name_lst"].apply(lambda x: len(x))
    print(df["size"].describe())
    ##### output
    print("Output file with short reduce canonical name:", output_file)
    print("- to find canonical name in abstract sci. papers")
    tot_rows = df.shape[0]
    ts = time.time()
    total_read = 0
    total_write = 0
    dct = {}
    with open(output_file, "w") as f:
        f.write("canonicalName_word;tab_key\n")
        for i, r in df.iterrows():
            total_read += 1
            if time.time() - ts > 10:
                ts = time.time()
                print(
                    total_read,
                    "/",
                    tot_rows,
                    round(total_read / tot_rows * 100, 2),
                    "%",
                )
            if r["key"] is None:
                continue
            if r["size"] == 1:
                name = r["canonicalName"]
                # dct[name] = [r['key']]
                # print(name, dct[name], 'name first in')
                f.write(name + ";" + str(r["key"]) + "\n")
                total_write += 1
                continue
            for name in r["name_lst"]:
                f.write(name + ";" + str(r["key"]) + "\n")
                total_write += 1
            continue
        print("Data report:", "total_read", total_read, "total_write",
              total_write)
    print("End file generated:", output_file)

    if False:
        # note: the following code produced an out-of-memory error, hence the `if False` guard
        text = df["canonicalName"].values
        # create the transform
        vectorizer_canonicalName = CountVectorizer()
        # tokenize and build vocab
        vectorizer_canonicalName.fit(text)

        # summarize
        print("len vectorizer.vocabulary_):", len(vectorizer.vocabulary_))
        print("canonicalName:")
        i = 0
        for k, v in vectorizer_canonicalName.vocabulary_.items():
            print(k, v)
            i += 1
            if i > 10:
                break

        # encode a document
        text = "psygmatocerus guianensis"
        vector = vectorizer_canonicalName.transform([text])
        vector.toarray()

        text_out = vectorizer_canonicalName.inverse_transform(vector)
        print(text_out)

        dataX = []
        dataY = []
        for i, r in df.iterrows():
            dataY.append(int(r["key"]))
            v = vectorizer_canonicalName.transform([r["canonicalName"]])
            dataX.append(list(v.toarray()[0]))
        n_patterns = len(dataX)
        print("Total Patterns: ", n_patterns)
    labels.append(content[0])
    texts.append(content[1:])

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'], trainDF['label'])

# label encode the target variable
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)  # reuse the encoder fitted on the training labels

#%%
"""
Count Vector is a matrix notation of the dataset in which every row represents a 
document from the corpus, every column represents a term from the corpus, and every cell 
represents the frequency count of a particular term in a particular document.
"""
# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)
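
# A small sketch (not from the original source) of how to peek inside the count
# matrix described in the comment above: each row is a document, each column a
# vocabulary term, and each cell the term's frequency in that document.
print("vocabulary size:", len(count_vect.vocabulary_))
print("matrix shape (documents x terms):", xtrain_count.shape)
print(xtrain_count[:2].toarray())  # raw counts for the first two training documents
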
print('begin')
if __name__ == '__main__':
    # load the data
    train = pd.read_csv(r'../../../DATA/TRAIN_1/train.tsv', sep='\t')
    labels = np.array(train['Sentiment'])
    test = pd.read_csv(r'../../../DATA/TRAIN_1/test.tsv', sep='\t')
    print(train.shape)
    print(test.shape)
    train_size = train.shape[0]
    test_size = test.shape[0]
    bag_size = 17441

    # data preprocessing
    ct = CountVectorizer(max_df=0.95, min_df=5, stop_words='english')
    vector = ct.fit(pd.concat([train['Phrase'], test['Phrase']]))
    train_vec = ct.transform(train['Phrase'])
    # test_vec = ct.transform(test['Phrase'])
    print(train_vec.shape)
    # print(test_vec.shape)

    # one_hot = vector.toarray()
    # word_bag = ct.vocabulary_

    train_one_hot = train_vec.toarray()
    # test_one_hot = test_vec.toarray()

    print('train_one_hot size = ', len(train_one_hot))

    input_size = train_vec.shape[1]
    state = torch.load('../task2/task_1.pt')
Example #58
list_val = pd.to_numeric(df_val['label'])
list_val = list(list_val)
df_val['label'] = list_val

# chain the cleaning steps on 'cleaned_tweet' so earlier filters are not overwritten
df_val['cleaned_tweet'] = df_val.tweet.apply(lambda x: ' '.join(
    [word for word in x.split() if not word.startswith('@')]))
df_val['cleaned_tweet'] = df_val.cleaned_tweet.apply(lambda x: ' '.join(
    [word for word in x.split() if not word.startswith('#')]))
df_val['cleaned_tweet'] = df_val.cleaned_tweet.apply(lambda x: ' '.join(
    [word for word in x.split() if not word.startswith(' ')]))

tweetVal_train, tweetVal_test, label_train, label_test = train_test_split(
    df_val['tweet'], df_val['label'], test_size=0.25)

cv = CountVectorizer()
vect = cv.fit(tweetVal_train)
tweetVal_train_vec = cv.transform(tweetVal_train)
# tweetVal_train
# print tweetVal_train_vec
# print 'tweetval_train',tweetVal_train
# print 'label_train',label_train

reg = LogisticRegression(random_state=0)
reg.fit(tweetVal_train_vec, label_train)
label_pred = reg.predict(vect.transform(tweetVal_test))
print("accuracy Score :", mt.accuracy_score(label_test, label_pred))
# print("Precisiopn Score:",mt.precision_score(label_test,label_pred))

# cm=confusion_matrix(label_test,label_pred)
# print(cm)
Example #59
df = pd.read_csv('./datasets/combined_data.csv')
df = df.drop(columns='Unnamed: 0')
df['label'] = df['label'].map({'true': 0, 'false': 1, 'misleading': 1})
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

cvec = CountVectorizer(min_df=2,
                       max_features=5000,
                       ngram_range=(1, 2),
                       stop_words=None)
cvec.fit(X_train)


@app.route('/')
def home():
    return 'Thanks for checking out our misinformation classifier!'


# route 1: show a form to the user
@app.route('/form')
def form():
    # use flask's render_template function to display an html page
    return render_template('form.html')


# route 2: accept the form submission and do something fancy with it
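
# The original snippet is cut off here; a hypothetical sketch of what route 2
# might look like, left commented out. The route name, the form field name
# ('user_text'), and the fitted classifier `model` are assumptions, not from the
# original source, which only shows the fitted cvec.
# @app.route('/submit', methods=['POST'])
# def make_prediction():
#     user_text = request.form['user_text']      # hypothetical form field
#     X_new = cvec.transform([user_text])
#     pred = model.predict(X_new)[0]              # `model` is a hypothetical fitted classifier
#     return render_template('results.html', prediction=pred)
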
data_x_test=pd.DataFrame()
OHE=OneHotEncoder()
for feature in one_hot_feature:
    OHE.fit(data_x[feature].values.reshape(-1,1))
    train_a=OHE.transform(x_train[feature].values.reshape(-1,1))
    valid_a=OHE.transform(x_valid[feature].values.reshape(-1,1))
    test_a=OHE.transform(data_test[feature].values.reshape(-1,1))
    data_x_train=sparse.hstack((data_x_train,train_a))
    data_x_valid=sparse.hstack((data_x_valid,valid_a))
    data_x_test=sparse.hstack((data_x_test,test_a))
print 'one_hot finish'

# an explicit tokenizer overrides token_pattern, so only the tokenizer is kept here
CVec = CountVectorizer(analyzer='word', tokenizer=lambda x: x.split(' '))
#CVec=CountVectorizer()
for feature in vector_feature:
    CVec.fit(data_x[feature])
    train_a=CVec.transform(x_train[feature])
    valid_a=CVec.transform(x_valid[feature])
    test_a=CVec.transform(data_test[feature])
    data_x_train=sparse.hstack((data_x_train,train_a))
    data_x_valid=sparse.hstack((data_x_valid,valid_a))
    data_x_test=sparse.hstack((data_x_test,test_a))
    df_tmp=pd.DataFrame(CVec.get_feature_names(),columns=['val'])
    #feature important mapping
    df_tmp['feature']='%s' %feature
    df_feature_map=pd.concat([df_feature_map,df_tmp])
print ' countvec finish'
df_feature_map.to_csv(save_path+"feature_important_mapping_cut.csv")

sparse.save_npz(save_path+"data_x_train_cut.npz",data_x_train)
x_train.to_csv(save_path+"x_train_cut.csv",index=None)