def get_words(tweet):
    """Return the distinct words of *tweet*, singularized and lemmatized.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list
        The unique words (TextBlob ``Word`` objects, a ``str`` subclass)
        in first-appearance order.
    """
    # TextBlob tokenizes; its WordList supports bulk singularize/lemmatize.
    words = TextBlob(tweet).words.singularize().lemmatize()
    # dict.fromkeys de-duplicates in O(n) while preserving insertion order
    # (the original `if word not in L` scan was O(n^2)).
    return list(dict.fromkeys(words))
def SetUpDataFrameFromFeatures(self):
    """Build a product-by-feature word-occurrence DataFrame.

    For each product's ingredient string, tokenize and singularize the
    words with TextBlob and mark which feature words appear.

    Returns
    -------
    pandas.DataFrame
        One row per product id (index named 'ProductIDs', also copied
        into a 'ProductIDs' column), one column per feature word.  A cell
        is ``1`` when the feature occurs in the product's ingredients,
        otherwise the string ``'0'`` — the mixed int/str encoding is kept
        for backward compatibility with the original implementation.
    """
    gtin_list = self.df_GtinList
    feature_set = list(self.SetUpFeatures())
    product_ids = gtin_list['ProductIDs'].astype(str).tolist()

    # Debug traces kept from the original implementation.
    print('**********', len(self.IngredientsInString))
    print('$$$$$', len(product_ids))
    print('###%%%', len(feature_set))

    # Precompute word -> column index: O(1) lookups instead of the
    # original O(n) `in featureSet` / `featureSet.index(k)` per word.
    feature_index = {word: i for i, word in enumerate(feature_set)}

    occurrence_by_product = {}
    # enumerate() replaces the manual `ctr` counter: the original placed
    # `ctr += 1` after the try-block, so `continue` in the except clause
    # skipped it and the progress counter drifted from productIds[i].
    for i, ingredients in enumerate(self.IngredientsInString):
        if i % 557 == 0:  # sparse progress trace
            print('ctr: ', i, 'productid ', product_ids[i],
                  'selfingredients', ingredients)
        word_flags = len(feature_set) * ['0']
        try:
            words = TextBlob(ingredients).words.singularize()
            for word in words:
                idx = feature_index.get(word)
                if idx is not None:
                    word_flags[idx] = 1
            occurrence_by_product[product_ids[i]] = word_flags
        except Exception:
            # Best-effort by design: rows TextBlob cannot parse are
            # skipped rather than aborting the whole build.
            continue

    frame = pd.DataFrame(occurrence_by_product).transpose()
    frame.columns = self.SetUpFeatures()
    frame.index.name = 'ProductIDs'
    frame['ProductIDs'] = frame.index
    return frame
# Write bigram/trigram frequency reports (one "<ngram>: <count>" line each,
# sorted by descending frequency) next to the input data file.
if args.ngrams > 1:
    bi_freq = {bigram: len(bi.sent_dict[bigram]) for bigram in bi.bigrams}
    bi_freq = sorted(bi_freq.items(), key=lambda x: x[1], reverse=True)
    filename = os.path.splitext(args.data_file)[0] + '__bi.txt'
    print('Output file: {}'.format(filename))
    # os.path.join instead of os.sep.join: handles trailing separators
    # in working_dir correctly and is the portable idiom.
    with open(os.path.join(args.working_dir, filename), 'w') as fp:
        for bigram, freq in bi_freq:
            fp.write('{}: {}\n'.format(bigram, freq))
    if args.ngrams > 2:
        tri_freq = {
            trigram: len(tri.sent_dict[trigram]) for trigram in tri.trigrams
        }
        tri_freq = sorted(tri_freq.items(), key=lambda x: x[1], reverse=True)
        filename = os.path.splitext(args.data_file)[0] + '__tri.txt'
        print('Output file: {}'.format(filename))
        with open(os.path.join(args.working_dir, filename), 'w') as fp:
            for trigram, freq in tri_freq:
                fp.write('{}: {}\n'.format(trigram, freq))
print('Done!')

from textblob import TextBlob

txt = 'good'
txtobj = TextBlob(txt)
# singularize() lives on the WordList (txtobj.words), not on TextBlob
# itself — the original `txtobj.singularize()` raised AttributeError.
b = txtobj.words.singularize()
try:
    c = txtobj.detect_language()
except NotImplementedError:
    # detect_language() was removed in textblob >= 0.16 (the Google
    # Translate backend was dropped); treat language as unavailable.
    c = None