Example #1
def get_words(tweet):
    """Return the unique words of *tweet*, singularized and lemmatized.

    Each word appears once, in order of first occurrence.

    :param tweet: raw tweet text (str)
    :return: list of unique, normalized words
    """
    # TextBlob tokenizes into a WordList; WordList provides the
    # element-wise singularize()/lemmatize() helpers used here.
    words = TextBlob(tweet).words.singularize().lemmatize()
    # dict.fromkeys de-duplicates in O(n) while preserving first-seen
    # order; the original `word not in L` loop was O(n^2).
    return list(dict.fromkeys(words))
Example #2
    def SetUpDataFrameFromFeatures(self):
        """Build a 0/1 occurrence matrix of feature words per product.

        For every product's ingredient text, mark which feature words
        (singularized via TextBlob) occur in it.

        Returns:
            pandas.DataFrame indexed by 'ProductIDs' (also duplicated as
            a regular 'ProductIDs' column), one int 0/1 column per
            feature word.
        """
        gtin_list = self.df_GtinList
        feature_set = list(self.SetUpFeatures())
        # Precompute word -> column position once; calling
        # featureSet.index(k) inside the loop made the original
        # quadratic in the number of features.
        feature_pos = {word: idx for idx, word in enumerate(feature_set)}
        product_ids = gtin_list['ProductIDs'].astype(str).tolist()
        print('**********', len(self.IngredientsInString))
        print('$$$$$', len(product_ids))
        print('###%%%', len(feature_set))
        occurrence_by_product = {}
        for i, text in enumerate(self.IngredientsInString):
            # Periodic progress trace. Using the loop index fixes the
            # original `ctr`, which drifted whenever an exception
            # skipped its increment.
            if i % 557 == 0:
                print('ctr: ', i, 'productid ', product_ids[i],
                      'selfingredients', text)
            # Int 0/1 throughout; the original seeded with str '0' and
            # wrote int 1, producing mixed-type (object) columns.
            word_flags = [0] * len(feature_set)
            try:
                words = sorted(TextBlob(text).words.singularize())
                for word in words:
                    pos = feature_pos.get(word)
                    if pos is not None:
                        word_flags[pos] = 1
                occurrence_by_product[product_ids[i]] = word_flags
            except Exception:
                # Best effort: skip rows whose text TextBlob cannot
                # process, as the original did.
                continue
        frame = pd.DataFrame(occurrence_by_product).transpose()
        frame.columns = self.SetUpFeatures()
        frame.index.name = 'ProductIDs'
        frame['ProductIDs'] = frame.index
        return frame
Example #3
    if args.ngrams > 1:
        # Write bigram document frequencies, most frequent first.
        base_name = os.path.splitext(args.data_file)[0]
        bigram_counts = {bg: len(bi.sent_dict[bg]) for bg in bi.bigrams}
        ranked = sorted(bigram_counts.items(),
                        key=lambda kv: kv[1],
                        reverse=True)
        out_name = base_name + '__bi.txt'
        print(f'Output file: {out_name}')
        with open(os.sep.join([args.working_dir, out_name]), 'w') as out:
            out.writelines(f'{gram}: {count}\n' for gram, count in ranked)

        if args.ngrams > 2:
            # Same report for trigrams (only produced alongside bigrams).
            trigram_counts = {
                tg: len(tri.sent_dict[tg])
                for tg in tri.trigrams
            }
            ranked = sorted(trigram_counts.items(),
                            key=lambda kv: kv[1],
                            reverse=True)
            out_name = base_name + '__tri.txt'
            print(f'Output file: {out_name}')
            with open(os.sep.join([args.working_dir, out_name]), 'w') as out:
                out.writelines(f'{gram}: {count}\n'
                               for gram, count in ranked)

    print('Done!')

from textblob import TextBlob

txt = 'good'
txtobj = TextBlob(txt)
# TextBlob itself has no singularize(); the per-word normalizers live on
# the WordList returned by .words (the original line raised
# AttributeError).
b = txtobj.words.singularize()
# NOTE(review): detect_language() is deprecated/removed in textblob
# >= 0.16 (it raises and directs users to the official Google Translate
# API) — guard it so the snippet keeps running on current versions.
try:
    c = txtobj.detect_language()
except Exception:
    c = None