Example #1
from textblob import TextBlob


def get_words(tweet):  # create a list of all the words in a tweet
    tweet_blob = TextBlob(tweet)
    tweet_blob = tweet_blob.words
    tweet_blob = tweet_blob.singularize()
    tweet_blob = tweet_blob.lemmatize()  # the words of the tweet, lemmatized and singularized
    L = []
    for word in tweet_blob:  # make sure each word appears only once in the list
        if word not in L:
            L += [word]
    return L
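A minimal usage sketch for the example above, with an invented sample tweet; it assumes textblob is installed and its corpora were fetched with python -m textblob.download_corpora:

tweet = "The cats were chasing the other cats"  # hypothetical input
print(get_words(tweet))
# singularize() maps "cats" -> "cat", and the membership check then keeps only one copy,
# so the returned list contains each singularized, lemmatized word exactly once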
Example #2
import sys

from nltk.tokenize import sent_tokenize
from textblob import TextBlob
from textblob.np_extractors import ConllExtractor


def parseContents(contentList):
    tupleList = []
    # OpenNLP here is the project's own wrapper class around the Apache OpenNLP tools
    # (its import is not shown in the original snippet)
    posTagger = OpenNLP("/home/rohith/nitk/apache-opennlp-1.6.0", "POSTagger", "en-pos-maxent.bin")
    chunker = OpenNLP("/home/rohith/nitk/apache-opennlp-1.6.0", "ChunkerME", "en-chunker.bin")
    for item in contentList:
        attr = item[0]
        content = item[1]

        content = content.replace('\n', '')
        sentences = sent_tokenize(content)
        for sentence in sentences:
            print('#' + sentence, file=sys.stderr)
            extractor = ConllExtractor()
            np = TextBlob(sentence, np_extractor=extractor).noun_phrases
            yield attr, np.lemmatize()  # noun_phrases is a WordList, so it can be lemmatized directly
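The TextBlob part of the generator above, isolated: extract noun phrases with the ConllExtractor and lemmatize them. A minimal sketch with an invented sentence, assuming the required NLTK/TextBlob corpora (punkt, conll2000, wordnet) are available:

from textblob import TextBlob
from textblob.np_extractors import ConllExtractor

extractor = ConllExtractor()
blob = TextBlob("Python developers love natural language processing libraries.",
                np_extractor=extractor)
# noun_phrases is a WordList, so it supports lemmatize() directly, as in parseContents
print(blob.noun_phrases.lemmatize())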
Example #3
from textblob import TextBlob, Word


def Tokenization_Stemmer(str1):

    zen = TextBlob(str1)
    zen = zen.words
    zen = zen.lemmatize()  # reduce plural nouns to their singular base form

    zen = list(zen)
    for i in range(len(zen)):
        w = Word(zen[i])
        zen[i] = w.lemmatize("v")  # reduce verb forms to the base verb
    for i in range(len(zen)):
        zen[i] = zen[i].lower()

    zen = sorted(zen)

    return zen
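The two passes above illustrate the pos argument of Word.lemmatize(): the default treats words as nouns, while "v" lemmatizes verbs. A quick check, assuming the WordNet corpus is available:

from textblob import Word

print(Word("cars").lemmatize())        # "car"  -- default noun lemmatization
print(Word("running").lemmatize())     # "running" -- unchanged when treated as a noun
print(Word("running").lemmatize("v"))  # "run"  -- verb lemmatization
print(Tokenization_Stemmer("The cars were running"))  # ['be', 'car', 'run', 'the']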
Example #4
    # Relies on module-level imports: json, shelve, requests,
    # `from textblob import TextBlob, Word`, and a module-level global `gb` (the username).
    def createdb(self):
        filepath = '/home/an/Desktop/file.log'
        with open(filepath) as fp:
            line = fp.readline().strip()
            strlist = []
            str1 = ''
            while line:
                # print(len(line.strip()))

                line = fp.readline()
                count = len(line.strip())
                print(count)
                if count == 1:
                    # single-character lines are accumulated into one word
                    str1 += line.strip()
                else:
                    if len(str1) > 1 and str1.isalpha():
                        # spell-correct the accumulated word, then lemmatize and lowercase it
                        str2 = TextBlob(str1)
                        str1 = str2.correct()
                        str2 = Word(str1)
                        str1 = str2.lemmatize()
                        str1 = str(str1.lower())
                        strlist.append(str1)
                    str1 = ''
        # the with-block has already closed the file; truncate the log for the next run
        open('/home/an/Desktop/file.log', 'w').close()
        dbfile = shelve.open("dbfile")
        # dbfile.clear()
        global gb
        if gb in list(dbfile.keys()):
            dbfile[gb] += strlist
        else:
            dbfile[gb] = strlist
        # print(list(dbfile.keys()))
        try:
            # upload the tokens collected so far; clear them once the server confirms receipt
            list1 = dbfile[gb]
            payload = {"username": gb, "tokens": list1}
            r1 = requests.post('http://127.0.0.1:5000/tokenposter',
                               data=json.dumps(payload))
            print(r1.text)
            if r1.text == 'received':
                dbfile[gb] = []

        except Exception as e:
            print(e)
        dbfile.close()
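The lemmatization chain used in the method above, isolated: spell-correct the accumulated word with TextBlob, then lemmatize and lowercase it. A minimal sketch with an invented token, assuming the TextBlob corpora are installed:

from textblob import TextBlob, Word

raw = "runninng"                      # hypothetical, deliberately misspelled token
corrected = TextBlob(raw).correct()   # spelling correction returns a TextBlob
word = Word(corrected)                # Word is a str subclass, so it accepts the blob's text
print(str(word.lemmatize().lower()))  # noun lemmatization, then lowercase, as in createdb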
Example #5
from textblob import TextBlob, Word


def Tokenization_Stemmer(str1):

    zen = TextBlob(str1)

    zen = zen.words  # tokenize into words

    zen = zen.lemmatize()  # reduce plural nouns to their singular base form

    zen = list(zen)
    # reduce verb participles and gerunds to the base verb
    for i in range(len(zen)):
        w = Word(zen[i])
        zen[i] = w.lemmatize("v")
    # convert every word to lowercase
    for i in range(len(zen)):
        zen[i] = zen[i].lower()
    zen = sorted(zen)
    return zen