Example #1
def tokenize_merge(row):
    allwords = []
    for text in row.iloc[1:].dropna():
        # strip leading bytes-literal debris such as b'... or b"...
        # (lstrip takes a character set, so this drops any leading b, ' or ")
        text = text.lstrip("b'\"")
        s = Sentence.from_raw(text, StopWords, neg_mark=True)
        allwords += s.words

    print(allwords)  # show progress
    return allwords
Example #2
def test_negation_suffix():
    stopwords = common.make_stop_words()
    sentences = ["I don't like Beijing 123, because it's too expensive",
                 "I cannot 4 run away 56, since I am a grown man",
                 "never ever come back again, I swear to god",
                 "without any problem",
                 "I don't think I will enjoy it: it might be too spicy"]
    for index, raw_sent in enumerate(sentences):
        sentence = Sentence.from_raw(raw_sent, stopwords)
        print("\n=========================== [{}]".format(index + 1))
        print(sentence.raw)
        print(sentence.words)
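None of the examples show what the negation handling inside Sentence.from_raw actually does. A rough sketch of the usual technique follows; the _NEG suffix, the cue list, and the punctuation-bounded scope rule are assumptions for illustration, not taken from these projects:

import re

NEGATION_RE = re.compile(r"(?:not|no|never|without|n't)$")
CLAUSE_BREAK = {",", ".", ";", ":", "!", "?"}

def mark_negation(tokens):
    # append a _NEG suffix to every token between a negation cue and the
    # next clause-level punctuation mark
    marked, in_scope = [], False
    for tok in tokens:
        if tok in CLAUSE_BREAK:
            in_scope = False      # punctuation closes the negation scope
            marked.append(tok)
        elif NEGATION_RE.search(tok):
            in_scope = True       # the cue word itself stays unmarked
            marked.append(tok)
        else:
            marked.append(tok + "_NEG" if in_scope else tok)
    return marked

# mark_negation("i do n't like beijing , because it 's too expensive".split())
# -> ['i', 'do', "n't", 'like_NEG', 'beijing_NEG', ',', 'because',
#     'it', "'s", 'too', 'expensive']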
Example #3
def preproc_save_sentences(filename, raw_sent_stream, extra_stopwords=None):
    stop_words = set(stopwords.words("english"))
    if extra_stopwords is not None:
        stop_words |= set(extra_stopwords)

    with open(filename, "wt") as outf:
        outf.write("[")

        written = 0
        for index, raw_sent in enumerate(raw_sent_stream):
            sentence = Sentence.from_raw(raw_sent, stop_words)
            if len(sentence.words) > 0:
                # only the first written entry gets no leading comma;
                # keying this off `index` would break when a sentence is skipped
                outf.write('\n' if written == 0 else ',\n')
                outf.write(sentence.dump_json())
                written += 1
                print("{}-th sentence processed and saved".format(index + 1))

        outf.write("\n]")
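A minimal driver for this function; the filename, sentences, and extra stop words below are invented for illustration:

raw_sentences = iter(["I don't like it, it is too expensive",
                      "without any problem"])
preproc_save_sentences("sentences.json", raw_sentences,
                       extra_stopwords=["beijing", "pullman"])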
Example #4
def test_sentence():
    stopwords = text_utility.make_stop_words()

    texts = [
        "can't is a contraction", "she isn't my wife any more",
        "I am not in USA right now", "I'm a Chinese",
        "1630 NE Valley Rd, Pullman, WA, 99163, Apt X103",
        "I should've done that thing I didn't do", "I don't love her any more",
        "I want to divorce without hesitation", "bye, Pullman, bye, USA"
    ]

    for index, text in enumerate(texts):
        sent = Sentence.from_raw(text, stopwords, True)
        print("\n******************** {}".format(index + 1))

        print(sent.raw)
        print("===>")
        print(sent.words)
Example #5
def print_topics(txt):
    sentence = Sentence.from_raw(txt, stop_words)
    print("\n{}\n".format(sentence.raw))

    coded_words = wordcoder.code(sentence.words)
    bow = dictionary.doc2bow(coded_words)

    # topics for this document, sorted by descending weight
    topic_distribution = list(lda_model[bow])
    topic_distribution.sort(key=lambda t: t[1], reverse=True)

    # merge all weighted topics into a single normalized tag set
    tags = None
    for topic_id, topic_percentage in topic_distribution:
        mt = MixTopic(topic_mapping[topic_id])
        mt.weight(topic_percentage)

        if tags is None:
            tags = mt
        else:
            tags.add(mt)

    tags.normalize()
    print(tags)
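print_topics relies on module-level state (stop_words, wordcoder, dictionary, topic_mapping, and a trained gensim-style lda_model); once those are loaded, a call is just one line (the review text here is invented):

print_topics("The noodles were great, but I don't like the slow service")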
Example #6
def update_add_neg_suffix(dbname, query_condition):
    stop_words = common.make_stop_words()
    client = MongoClient()
    review_collection = client[dbname]['reviews']

    cursor = review_collection.find(query_condition, {"sentences.raw": 1, "sentences.words": 1})
    for rindex, rd in enumerate(cursor):
        review = Review.from_dict(rd)

        # re-tokenize each sentence and collect only the ones whose words changed
        update_content = {}
        for sindex, sent in enumerate(review.sentences):
            new_sent = Sentence.from_raw(sent.raw, stop_words)
            if set(new_sent.words) != set(sent.words):
                update_content["sentences.{}.words".format(sindex)] = new_sent.words

        if len(update_content) > 0:
            result = review_collection.update_one({"_id": review.id}, {"$set": update_content})
            if result.modified_count != 1:
                raise Exception("failed to update review<{}>".format(review.id))

        print("{}-th review updated {} sentences".format(rindex + 1, len(update_content)))

    client.close()
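A call needs only a database name and a MongoDB filter document; both arguments below are made up for illustration:

update_add_neg_suffix("yelp_reviews", {"sentences": {"$exists": True}})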
Example #7
def __init__(self, id=None, text=None, is_positive=None):
    self.id = id
    self.sent = None if text is None else Sentence.from_raw(
        text, Review.StopWords, neg_mark=True)
    if self.sent is not None:
        self.sent.sentiment = is_positive
Example #8
def assign_comment(self, text, stop_words):
    self.sentences = []
    # split the comment into sentences and keep only those that still
    # have words after stop-word removal
    for raw_sentence in Review.SentTokenizer.tokenize(text):
        sent = Sentence.from_raw(raw_sentence, stop_words)
        if len(sent.words) > 0:
            self.sentences.append(sent)
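All of the examples lean on the same small surface of the Sentence class without ever defining it. A minimal sketch of the interface they collectively assume follows; everything here is inferred from the call sites above, not taken from the original project, and the tokenization is deliberately naive (the real code almost certainly uses NLTK):

import json

class Sentence(object):
    # minimal stand-in for the Sentence class used in every example

    def __init__(self, raw, words):
        self.raw = raw            # original text, printed by the tests
        self.words = words        # cleaned tokens, what callers consume
        self.sentiment = None     # assigned externally in Example #7

    @classmethod
    def from_raw(cls, raw, stop_words, neg_mark=False):
        tokens = [t.strip(".,:;!?").lower() for t in raw.split()]
        words = [t for t in tokens if t and t not in stop_words]
        if neg_mark:
            words = mark_negation(words)  # sketch shown after Example #2
        return cls(raw, words)

    def dump_json(self):
        # Example #3 concatenates these dumps into a hand-built JSON array
        return json.dumps({"raw": self.raw, "words": self.words})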