from textblob import TextBlob

def get_words(tweet):
    # Build a list of all the words in a tweet, singularized and lemmatized.
    tweet_blob = TextBlob(tweet)
    tweet_blob = tweet_blob.words
    tweet_blob = tweet_blob.singularize()
    tweet_blob = tweet_blob.lemmatize()
    # Return the words of the tweet with duplicates removed, preserving order.
    L = []
    for word in tweet_blob:
        # Make sure each word appears only once in the list.
        if word not in L:
            L.append(word)
    return L
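# Minimal usage sketch for get_words (assumes the TextBlob corpora have been
# installed via `python -m textblob.download_corpora`); the tweet is invented.
words = get_words("Cats are chasing the mice in the garden")
print(words)  # a de-duplicated list of singularized, lemmatized words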
import sys

from nltk.tokenize import sent_tokenize
from textblob import TextBlob
from textblob.np_extractors import ConllExtractor
# OpenNLP is assumed to be a project-local wrapper around the Apache OpenNLP
# command-line tools; it is not part of TextBlob or NLTK.

def parseContents(contentList):
    tupleList = []
    posTagger = OpenNLP("/home/rohith/nitk/apache-opennlp-1.6.0", "POSTagger", "en-pos-maxent.bin")
    chunker = OpenNLP("/home/rohith/nitk/apache-opennlp-1.6.0", "ChunkerME", "en-chunker.bin")
    for item in contentList:
        attr = item[0]
        content = item[1]
        content = content.replace('\n', '')
        sentences = sent_tokenize(content)
        for sentence in sentences:
            print('#' + sentence, file=sys.stderr)
            # Extract noun phrases with TextBlob's ConllExtractor, then
            # yield them lemmatized alongside the attribute name.
            extractor = ConllExtractor()
            np = TextBlob(sentence, np_extractor=extractor).noun_phrases
            yield attr, np.lemmatize()
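# Hypothetical driver for the parseContents generator; contentList pairs an
# attribute name with a block of text (both values below are invented).
contentList = [("summary", "Apache OpenNLP ships pretrained models. TextBlob extracts noun phrases.")]
for attr, phrases in parseContents(contentList):
    print(attr, list(phrases))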
import json
import shelve

import requests
from textblob import TextBlob, Word

def createdb(self):
    filepath = '/home/an/Desktop/file.log'
    with open(filepath) as fp:
        line = fp.readline()
        strlist = []
        str1 = ''
        while line:
            count = len(line.strip())
            print(count)
            if count == 1:
                # Single-character lines are keystrokes; accumulate them into a word.
                str1 += line.strip()
            else:
                # A blank or longer line ends the word: spell-correct it,
                # lemmatize it, lower-case it, and store it.
                if len(str1) > 1 and str1.isalpha():
                    str1 = str(TextBlob(str1).correct())
                    str1 = Word(str1).lemmatize()
                    strlist.append(str(str1).lower())
                str1 = ''
            # Read the next line at the end of the loop so that the first
            # line of the file is processed as well.
            line = fp.readline()
    # Truncate the log now that its contents have been consumed.
    open('/home/an/Desktop/file.log', 'w').close()
    dbfile = shelve.open("dbfile")
    global gb  # gb (the current username) is defined elsewhere in this module
    if gb in list(dbfile.keys()):
        dbfile[gb] += strlist
    else:
        dbfile[gb] = strlist
    try:
        list1 = dbfile[gb]
        payload = {"username": gb, "tokens": list1}
        r1 = requests.post('http://127.0.0.1:5000/tokenposter', data=json.dumps(payload))
        print(r1.text)
        # Clear the local buffer once the server acknowledges receipt.
        if r1.text == 'received':
            dbfile[gb] = []
    except Exception as e:
        print(e)
    dbfile.close()
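# Hypothetical smoke test for createdb: file.log is assumed to hold one
# keystroke per line, with a blank (or longer) line terminating each word;
# `gb` and the /tokenposter endpoint are assumed to exist in the real project.
gb = "testuser"
with open('/home/an/Desktop/file.log', 'w') as fp:
    fp.write("h\ni\n\n")
createdb(None)  # `self` is never used, so a placeholder value is fine here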
from textblob import TextBlob, Word

def Tokenization_Stemmer(str1):
    zen = TextBlob(str1)
    zen = zen.words        # tokenize
    zen = zen.lemmatize()  # reduce plural nouns to their singular base form
    zen = list(zen)
    # Reduce verb participles and gerunds to their base form.
    for i in range(len(zen)):
        w = Word(zen[i])
        zen[i] = w.lemmatize("v")
    # Convert every word to lower case.
    for i in range(len(zen)):
        zen[i] = zen[i].lower()
    zen = sorted(zen)
    return zen
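# Minimal usage sketch for Tokenization_Stemmer (assumes the TextBlob corpora
# are installed); the input sentence is invented.
print(Tokenization_Stemmer("The children were eating apples"))
# expected: a sorted list of lower-cased base forms,
# e.g. ['apple', 'be', 'child', 'eat', 'the'] (exact output may vary)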