Example #1
1
def sentiment_analysis(message):
	actual_range = 2
	final = []
	message = re.sub("(@[A-Za-z0-9]+)|( RT)|( rt)|(\w+:\/\/\S+)"," ",message).strip() #filter usernames,urls
	message = re.sub('#',"",message)
	message = filter(lambda x: x in string.printable, message) #filter non printable characters
	message = HTMLParser.HTMLParser().unescape(message) #unescape html
	tokenized = tokenize(message, punctuation='.!?:')
	tokenized = filter(bool,tokenized)
	tok1=[]
	for index,it in enumerate(tokenized):
		mod = mood(it)
		if '?' in it or mod=='conditional':
			continue
		tok1.append(it.strip())
	score = 0.0
	possed = [re.split(' ',sentence)for sentence in tok1]
	possed = [nltk.pos_tag(sentence) for sentence in possed]
	final = []
	for sentence in possed:
		check = []
		for entry in sentence:
			check.append(list(entry))
		final.append(check)
	range_count=0
	for sentence in final:
		sentence = dictionary_tag(sentence)
		score = score + sentiment_score(sentence)
	return score
Example #2
0
 def __iter__(self):
     if os.path.isdir(self.fname):
         filenames = [
             os.path.join(self.fname, f) for f in os.listdir(self.fname)
         ]
     else:
         filenames = [self.fname]
     for filename in filenames:
         with io.open(filename, encoding='utf-8') as f:
             squad = json.load(f)
             print "Loaded data of len", len(squad['data'])
             for d in squad['data']:
                 if self.mode == "squad":
                     yield [self.begin] + list(d["sentence"]) + [
                         self.middle
                     ] + list(d["question"]) + [self.end], list(
                         d["answer"]) + [self.end]
                 elif self.mode == "squad_word":
                     yield [self.begin
                            ] + tokenize(d["sentence"])[0].split(" ") + [
                                self.middle
                            ] + tokenize(d["question"])[0].split(" ") + [
                                self.end
                            ], tokenize(
                                d["answer"])[0].split(" ") + [self.end]
                 elif self.mode == "squad_ptr":
                     yield [self.begin] + list(d["sentence"]) + [
                         self.middle
                     ] + list(d["question"]) + [self.end], list(
                         d["answer"]) + [self.end]
Example #3
0
def opinioncheck(line):
	sentences = tokenize(line)
	for s in sentences:
		tokens= tokenize(s)
		# print tokens
		for token in tokens:
			for word in token.split():
				if word in poslist:
					posop.append(line)
				if word in neglist:
					negop.append(line)
Example #4
0
    def __iter__(self):
        if os.path.isdir(self.fname):
            filenames = [
                os.path.join(self.fname, f) for f in os.listdir(self.fname)
            ]
        else:
            filenames = [self.fname]
        for filename in filenames:
            with open(filename) as f:
                doc = f.read()
                if self.mode == "oedilf":
                    toks = [self.begin]
                    for i, line in enumerate(doc.split("\n")):
                        if not line: continue
                        line = ''.join([
                            char for char in line.lower()
                            if char in "qwertyuioplkjhgfdsazxcvbnm "
                        ])

                        line_toks = ' '.join(tokenize(line)).split(" ") + [
                            '<br' + str(i) + '>'
                        ]
                        toks += [tok for tok in line_toks if tok != '']
                    yield toks + [self.end]
                if self.mode == "oedilf_rhymes":
                    toks = [self.begin]
                    for i, line in enumerate(doc.split("\n")):
                        if not line: continue
                        line = ''.join([
                            char for char in line.lower()
                            if char in "qwertyuioplkjhgfdsazxcvbnm "
                        ])

                        line_toks = ' '.join(tokenize(line)).split(
                            " ")[-1:] + ['<br' + str(i) + '>']
                        toks += [tok for tok in line_toks if tok != '']
                    yield toks + [self.end]
                if self.mode == "oedilf_s2s":
                    history = []
                    for i, line in enumerate(doc.split("\n")):
                        if not line: continue
                        line = ''.join([
                            char for char in line.lower()
                            if char in "qwertyuioplkjhgfdsazxcvbnm "
                        ])
                        line_toks = ' '.join(tokenize(line)).split(" ") + [
                            '<br' + str(i) + '>'
                        ]
                        line_toks = [tok for tok in line_toks if tok != '']
                        yield [self.begin] + history + [
                            self.end
                        ], line_toks + [self.end]
                        history += line_toks
Example #5
0
 def test_tokenize(self):
     # Assert list with two sentences.
     # The tokenizer should at least handle common abbreviations and
     # punctuation.
     v = en.tokenize("The cat is eating (e.g., a fish). Yum!")
     self.assertEqual(v, ["The cat is eating ( e.g. , a fish ) .", "Yum !"])
     print("pattern.en.tokenize()")
Example #6
    def form_sentences(self, text_block, block_id, remove_stopwords=False,
                       stem=True, form_tagged_doc=True):
        """
        parse a block of text and form a list of word-tokenized sentences
        :param text_block: single block of text as string
        :param block_id: id of the text block, used for hdfs storage
        :param remove_stopwords: remove the stopwords from the text
        :param stem: stem the words to their root form
        :param form_tagged_doc: form a tagged document for the Doc2vec model
        """
        sentences = pattern.tokenize(text_block.lower())
        sentences = [sentence.replace('\'', '').replace('(', ' ').replace(')', ' ') \
                         .replace("/", " or ").replace("-", "") for sentence in sentences]
        sentences = [self.sentence_func(TAG_RE.sub('', sentence)) for sentence in sentences]

        l_stemmer = lambda w: self.stemmer(w) if stem else w
        sentences = [[l_stemmer(w) for w in word_tokenize(sentence)
                      if self.__word_filter(w, remove_stopwords)] for sentence in sentences]

        if not form_tagged_doc:
            return sentences

        sentences = [TaggedDocument(words=words, tags=[str(block_id) + ' ' + str(index)])
                     for index, words in enumerate(sentences)]

        for sentence in sentences:
            self.doc_tags[sentence.tags[0]] = sentence

        return sentences
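
The docstring above describes the preprocessing pipeline for Doc2vec: split the block into sentences with pattern's tokenizer, word-tokenize and stem each sentence, then wrap each sentence in a gensim TaggedDocument. Below is a minimal standalone sketch of that pipeline, assuming pattern, nltk (with its punkt tokenizer data) and gensim are installed; the plain PorterStemmer and the isalpha() check are hypothetical stand-ins for the class helpers (stemmer, sentence_func, __word_filter) that are not shown here.

from pattern.en import tokenize                 # sentence splitter used above
from nltk.tokenize import word_tokenize         # word tokenizer used above
from nltk.stem.porter import PorterStemmer
from gensim.models.doc2vec import TaggedDocument

stemmer = PorterStemmer()                       # stand-in for self.stemmer
block_id = 0
text_block = "Doc2vec needs tagged sentences. Each sentence becomes one TaggedDocument."

# sentence split, then word-tokenize and stem each sentence
sentences = tokenize(text_block.lower())
words_per_sentence = [[stemmer.stem(w) for w in word_tokenize(s) if w.isalpha()]
                      for s in sentences]

# one TaggedDocument per sentence, tagged with "<block_id> <sentence index>"
tagged = [TaggedDocument(words=words, tags=[str(block_id) + ' ' + str(index)])
          for index, words in enumerate(words_per_sentence)]
print(tagged)
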
Example #7
0
def word_ranking(text, n='L2'):
    """
    rank the words of the text according to the LSA cross-method
    steps:
    1. tokenize text by sentences
    2. compute the tf-idf matrix
    3. apply SVD to the tf-idf matrix (reduce to n dimensions)
    4. rank terms according to the cross-method (source: http://www.aclweb.org/anthology/C10-1098.pdf)

    - text: string consisting of a few sentences
    - n: number of dimensions to keep in the reduced LSA space
    
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    #==============================================================================
    #     #synctatic filter
    #     exclude_list = []
    #     for sent in sentences:
    #         for word, pos in tag(sent):
    #             if pos != "JJ" or pos != 'NN': # Retrieve all adjectives and nouns.
    #                 exclude_list.append(word.lower())
    #==============================================================================

    # create documents list
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # dimensions number equal to euclidean norm of singular values
    # U, S, Vt = np.linalg.svd(m.vectors, full_matrices=False)
    # dimensions=int(round(np.linalg.norm(S, 2)))
    m.reduce(dimensions=n)

    # sentences selection according to cross-method
    # source: http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf
    # topic(rows) x tokens(cols) matrix(tfidf)
    V = np.array(m.lsa.vt)

    # average sentence score for each concept/topic by the rows of the Vt matrix
    avg_score = np.mean(V, axis=1).reshape((-1, 1))

    # cell values which are less than or equal to the average score are set to zero
    V[V <= avg_score] = 0.0

    # sigma matrix after performing SVD
    S = np.array(m.lsa.sigma).reshape((-1, 1))

    # total length of each sentence vector
    length = np.sum(V * S, axis=0)

    # ranking words by length score
    ranking = Counter(dict(zip(m.lsa.terms, length)))  #.most_common(n)

    #words, score =  list(zip(*ranking))

    return ranking
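
A hedged usage sketch, assuming the pattern library and the word_ranking() function above are importable; the sample text and the choice of n=2 dimensions are illustrative only.

text = ("The energy of stars comes from nuclear fusion. "
        "Stars are mostly made of hydrogen and helium. "
        "Fusion produces a lot of energy.")
ranking = word_ranking(text, n=2)     # reduce the LSA space to 2 dimensions
print(ranking.most_common(5))         # top 5 terms with their cross-method scores
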
Example #8
0
 def sentence_walk(self):
     output = []
     sents = tokenize(self.source_text)
     words = set(search.hypernym_search(self.source_text, "artifact"))
     pat = re.compile(" " + "|".join(words) + " ")
     sents = [s for s in sents if pat.search(s) != None]
     pprint(sents)
Example #9
0
def dispersion(text, keywords):
    """
    Dispersion of occurrences of the given keywords across the given text
    - text: string
    - keywords: list of keywords
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # tokenize by words
    tokens = []
    for sent in sentences:
        for w in sent.lower().split():
            tokens.append(w)

    n_tokens = len(tokens)
    n_words = len(keywords)
    disp = []

    for x in range(n_tokens):
        for y in range(n_words):
            if tokens[x] == keywords[y]:
                disp.append((x, y))

    x, y = list(zip(*disp))
    return x, y
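
A hedged usage sketch, assuming dispersion() above and pattern.en.tokenize are importable; the resulting (x, y) pairs are token-position / keyword-index coordinates that could feed a lexical dispersion plot.

text = "The star radiates energy. The energy of stars comes from fusion."
x, y = dispersion(text, ["energy", "star"])
print(list(zip(x, y)))   # one (token position, keyword index) pair per match
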
Example #10
0
 def test_tokenize(self):
     # Assert list with two sentences.
     # The tokenizer should at least handle common abbreviations and
     # punctuation.
     v = en.tokenize("The cat is eating (e.g., a fish). Yum!")
     self.assertEqual(v, ["The cat is eating ( e.g. , a fish ) .", "Yum !"])
     print("pattern.en.tokenize()")
Example #11
0
def summarize(text_to_summarize):
    stokens = tokenize(text_to_summarize)
 
    # STEP 1
    # pattern.vector's Document is a nifty bag-o-words structure,
    # with a TF weighting scheme
    docs = [Document(string= s, name=e,stemmer=LEMMA)
            for e,s in enumerate(stokens) if len(s.split(" ")) > 7]
    
    linkgraph = []
    # STEP 2 and 3 happen interwovenly
    for doc in docs:
        for doc_copy in docs:
            if doc.name != doc_copy.name:
                # STEP 2 happens here
                wordset_a = [x[1] for x in doc.keywords()]
                wordset_b = [y[1] for y in doc_copy.keywords()]
                jacc_dist = distance.jaccard(wordset_a, wordset_b)
                if jacc_dist < 1:
                    linkgraph.append((str(doc.name), #index to sentence
                                      str(doc_copy.name),1-jacc_dist)) #dist. score
    # By the time we reach here, we'd have completed STEP 3
    
    # STEP 4
    #I referenced this SO post for help with pagerank'ing
    #http://stackoverflow.com/questions/9136539/how-to-weighted-edges-affect-pagerank-in-networkx
    D=nx.DiGraph()
    D.add_weighted_edges_from(linkgraph)
    pagerank = nx.pagerank(D)
    sort_pagerank = sorted(pagerank.items(),key=operator.itemgetter(1))
    sort_pagerank.reverse()
    top2 = sort_pagerank[:2]
    orderedtop2 = [int(x[0]) for x in top2]
    orderedtop2 = sorted(orderedtop2)
    return " ".join([ stokens[i] for i in orderedtop2 ])
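
A hedged usage sketch, assuming summarize() above and its not-shown imports (pattern.en.tokenize, pattern.vector's Document/LEMMA, a distance module providing jaccard(), networkx as nx, and operator) are available. Only sentences longer than 7 words enter the graph, so the sample sentences are deliberately long.

article = ("A star is a massive ball of plasma held together by its own gravity. "
           "It radiates energy because of the nuclear reactions that happen inside it. "
           "The energy produced by stars radiates away from them as electromagnetic radiation.")
print(summarize(article))   # the two highest-ranked sentences, in their original order
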
Example #12
0
def summarize(text, n=2):
    """
    determine the most informative sentences by summing the ranks of the words
    that occur in the corresponding sentences
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # tokenize sentence list by words
    words_sent = [sent.lower().split() for sent in sentences]

    # words ranking
    w_ranking = word_ranking(text, n)

    # sents ranking = sum of words score
    s_ranking = defaultdict(int)

    for i, sent in enumerate(words_sent):
        for word in sent:
            if word in w_ranking:
                s_ranking[i] += w_ranking[word]

    # placed sents ranking into high-performance container
    s_ranking = Counter(s_ranking)

    # get top n sents indexes with scores
    sents_idx = s_ranking.most_common(n)

    output = [sentences[j[0]] for j in sents_idx]

    # reordering
    output.sort(lambda s1, s2: text.find(s1) - text.find(s2))

    return ' '.join(output)
Example #13
 def do_POST(self):
     form = cgi.FieldStorage(fp=self.rfile,
                             headers=self.headers,
                             environ={
                                 'REQUEST_METHOD': 'POST',
                                 'CONTENT_TYPE':
                                 self.headers['Content-Type'],
                             })
     if self.path != '/predict' or 'text' not in form.keys():
         self.send_response(404)
         self.end_headers()
         return 404
     self.send_response(200)
     self.send_header("Content-type", 'text/plain')
     self.end_headers()
     text = ' '.join(
         tokenize(
             re.sub('([a-z][.!?]+)([A-Z])', '\g<1> \g<2>',
                    form['text'].value, 0))).lower().split()
     x = [[w2indx.get(word, 0) for word in text]]
     x = sequence.pad_sequences(x,
                                maxlen=200,
                                padding='post',
                                truncating='post')
     predict = model.predict_classes(x)[0][0]
     self.wfile.write(bytes(LBL[predict], encoding='utf8'))
     return 200
Example #14
0
def sentiment_analysis(message):
    actual_range = 2
    final = []
    message = re.sub("(@[A-Za-z0-9]+)|( RT)|( rt)|(\w+:\/\/\S+)", " ",
                     message).strip()  #filter usernames,urls
    message = re.sub('#', "", message)
    message = filter(lambda x: x in string.printable,
                     message)  #filter non printable characters
    message = HTMLParser.HTMLParser().unescape(message)  #unescape html
    tokenized = tokenize(message, punctuation='.!?:')
    tokenized = filter(bool, tokenized)
    tok1 = []
    for index, it in enumerate(tokenized):
        mod = mood(it)
        if '?' in it or mod == 'conditional':
            continue
        tok1.append(it.strip())
    score = 0.0
    possed = [re.split(' ', sentence) for sentence in tok1]
    possed = [nltk.pos_tag(sentence) for sentence in possed]
    final = []
    for sentence in possed:
        check = []
        for entry in sentence:
            check.append(list(entry))
        final.append(check)
    range_count = 0
    for sentence in final:
        sentence = dictionary_tag(sentence)
        score = score + sentiment_score(sentence)
    return score
Example #15
    def clean_text(text):
        """
        :param text: text as str
        :return: list of sentences
        """

        try:
            text = text.strip()
            if text:
                final_sentences = []
                token_text = tokenize(text)

                for sentence in token_text:
                    words = sentence.split()
                    cleaned_tokens = [
                        porter_stemmer.stem(word) for word in words
                        if word not in punctuation
                    ]
                    cleaned_sent = " ".join(cleaned_tokens)
                    cleaned_sent = CleanTextProcessor.clean_not_words(
                        cleaned_sent)
                    cleaned_sentence = cleaned_sent + "."
                    final_sentences.append(cleaned_sentence)
                return final_sentences
            else:
                return []
        except:
            trace_err = StackTrace.get_stack_trace()
            msg = "CleanTextProcessor (clean_text()) : %s%s" % ("\n",
                                                                trace_err)
            log.error(msg)
            raise Exception(msg)
Example #16
0
def summarize(text, n=1):
    """
    extract most relevant sentences from text according to TextRank algorithm
    - text: string consisting of a few sentences
    - n: number of sentences to extract
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # create documents list
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # dict of TextRank ranking of cosine similarity matrix
    ranking = utils.textrank(m.documents, m.distance)

    # indexes of top n sentences
    top_sents_idx, _ = list(zip(*ranking.most_common(n)))

    # reordering
    output = [sentences[i] for i in sorted(top_sents_idx)]

    return ''.join(output)
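
A hedged usage sketch, assuming summarize() above, the pattern library, and the local utils module with its textrank() helper are importable; the text is illustrative.

text = ("A star is a massive ball of plasma held together by gravity. "
        "It radiates energy because of the nuclear reactions inside it. "
        "The energy produced by stars radiates away as electromagnetic radiation.")
print(summarize(text, n=1))   # the single most central sentence according to TextRank
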
Example #17
0
def keywords(text, n=15):
    """
    extract the most relevant keywords from the given text
    steps:
    1. tokenize text by words
    2. apply a syntactic filter (keep adjectives and nouns)
    3. compute the pairwise Levenshtein distance
    4. create a graph based on the distance matrix
    5. compute PageRank
    
    - text: string consisting of a few sentences
    - n: number of keywords to extract
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # syntactic filter
    words = []
    for sent in sentences:
        for word, pos in tag(sent):
            if pos == "JJ" or pos == 'NN':  # Retrieve all adjectives and nouns.
                words.append(word.lower())

    # dict of TextRank ranking of levenshtein distance matrix
    ranking = utils.textrank(words, utils.levenshtein)

    # top n keywords
    keywords, scores = list(zip(*ranking.most_common(n)))
    return keywords, scores
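
A hedged usage sketch, assuming keywords() above, pattern.en's tokenize/tag, and the local utils module with its textrank() and levenshtein() helpers are importable.

text = ("A star is a massive ball of hot plasma. "
        "The energy of stars comes from nuclear fusion.")
top_words, scores = keywords(text, n=5)
print(top_words)   # 5 highest-ranked adjectives/nouns
print(scores)      # their TextRank scores
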
Example #18
 def sentence_walk(self):
     output = []
     sents = tokenize(self.source_text)
     words = set(search.hypernym_search(self.source_text, 'artifact'))
     pat = re.compile(' ' + '|'.join(words) + ' ')
     sents = [s for s in sents if pat.search(s) != None]
     pprint(sents)
Example #19
0
def ngrams(text, n=1, lowercase=False):
    for s in tokenize(text):
        if lowercase:
            s = s.lower()
        s = s.split()
        for i in xrange(n):
            for j in xrange(len(s)-i):
                yield ' '.join(s[j:j+i+1])
Example #20
0
def split_text_to_list_of_sentences(raw_text):
    """ Split the raw text into list of sentences.
        Args:
            raw_text (str): text input in paragraphs.
        Returns:
            (list): list of str of sentences.
    """
    return tokenize(raw_text)
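
A hedged usage sketch; the expected output mirrors the pattern.en test case shown in the test_tokenize() examples above.

sentences = split_text_to_list_of_sentences("The cat is eating (e.g., a fish). Yum!")
print(sentences)   # ["The cat is eating ( e.g. , a fish ) .", "Yum !"]
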
Example #21
0
    def key_sentences(self):
        words = set(search.hypernym_search(self.source_text, "instrumentality"))
        sents = tokenize(self.source_text)
        pat = re.compile(" " + "|".join(words) + " ")
        sents = [s for s in sents if pat.search(s) != None]

        pprint(sents)
        pprint(words)
Example #22
0
def test_findTonkens_3():
    s = "I eat pizza with a fork."
    s = "Bachelor's degree in Computer Science or equivalent"
    s = "B.S. in Computer Science, a related degree or its equivalent "     
    s = "What's this? This is a book."  
    from pattern.en import tokenize     
    result = tokenize(s)
    print result
Example #23
def split_text_to_list_of_sentences(raw_text):
    """ Split the raw text into list of sentences.
        Args:
            raw_text (str): text input in paragraphs.
        Returns:
            (list): list of str of sentences.
    """
    return tokenize(raw_text)
Example #24
0
def test_findTonkens_3():
    s = "I eat pizza with a fork."
    s = "Bachelor's degree in Computer Science or equivalent"
    s = "B.S. in Computer Science, a related degree or its equivalent "
    s = "What's this? This is a book."
    from pattern.en import tokenize
    result = tokenize(s)
    print result
Example #25
0
def sentance_break(origin_text):
    """ Input: output text from gutenberg_text_gather
		Output: tokenized text, a list of strings 
		where the strings are the sentences
	"""
    text = tokenize(
        origin_text,
    )  # use pattern to break the string of text apart into a list of strings, where each string is a sentence
    return text
Example #26
0
 def form_sentences(self):
     f_p = open(CORPUS_FILE, "rbU")
     corpus_sentences = pattern.tokenize(f_p.read())
     f_p.close()
     self.sentences = defaultdict(list)
     for sentence in corpus_sentences:
         for v in VERBS:
             if sentence.find(" " + v + " ") != -1:
                 self.sentences[v].append(sentence)
Example #27
    def key_sentences(self):
        words = set(search.hypernym_search(self.source_text,
                                           'instrumentality'))
        sents = tokenize(self.source_text)
        pat = re.compile(' ' + '|'.join(words) + ' ')
        sents = [s for s in sents if pat.search(s) != None]

        pprint(sents)
        pprint(words)
Example #28
0
 def __iter__(self):
     if os.path.isdir(self.fname):
         filenames = [os.path.join(self.fname,f) for f in os.listdir(self.fname)]
     else:
         filenames = [self.fname]
     for filename in filenames:
         with open(filename) as f:
             doc = f.read()
             if self.mode == "ohhla":
                 toks = [self.begin]
                 for line in doc.split("\n"):
                     if not line: continue
                     toks +=  ' '.join(tokenize(line)).split(" ") + ['<br>']
                 yield toks + [self.end]
             elif self.mode == "ohhla_line_pairs":
                 lines = [tokenize(line) for line in doc.split("\n")]
                 for l1, l2 in zip(lines, lines[1:]):
                     inp_toks = [self.begin] + ' '.join(l1).split(" ") + [self.end]
                     outp_toks = ' '.join(l2).split(" ") + [self.end]
                     yield (inp_toks, outp_toks)
Example #29
0
def tokenize_pattern(text):
    """
    The tokenize() function returns a list of sentences, with punctuation marks split from words.
    """
    sents = tokenize(text, punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_«»…".decode("utf8"), replace={})
    """
    Returns a list of sentences of the form:
    Теперь , в 2014 году , голая Дженнифер Лоуренс появилась в Интернете за полтора месяца до всемирной премьеры первой части последней серии трилогии « Голодные игры : Сойка-пересмешница » ( The Hunger Games : Mockingjay – Part 1 ) .
    """
    tokens = [token.lower() for sent in sents for token in sent.split()]
    log.debug("Tokenize with Pattern")
    return tokens
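
A hedged usage sketch, assuming the pattern library and the module-level log logger referenced above are available; the expected tokens follow from the tokenize() behaviour shown in the test_tokenize() examples, lowercased and flattened into a single list.

tokens = tokenize_pattern("The cat is eating (e.g., a fish). Yum!")
print(tokens)
# roughly: ['the', 'cat', 'is', 'eating', '(', 'e.g.', ',', 'a', 'fish', ')', '.', 'yum', '!']
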
Example #30
0
    def __call__(self, org_doc):

        doc = org_doc

        tokens = doc.lower().split()
        ldoc = ' '.join([x for x in tokens if "_" not in x])

        # Identify which phrases were used
        keywords = [key for key in self.X if key in ldoc]
        punctuation = ".,;:!?()[]{}`''\"@#$^&*+-|=~"

        # Loop over the keywords and replace them one-by-one.
        # This is inefficient, but less error prone.

        parsed_sent = []

        for sent in tokenize(doc, punctuation=punctuation):

            for word in keywords:
                word_n_tokens = len(word.split())

                new_word = self.X[word]
                word_tokens = word.split()

                # Check if the substring tokens match
                tokens = sent.lower().split()
                mask = contains_sublist(tokens, word_tokens)
                while any(mask):
                    idx = mask.index(True)
                    sent = sent.split()
                    args = sent[:idx] + [
                        new_word,
                    ] + sent[idx + word_n_tokens:]
                    sent = ' '.join(args)
                    tokens = sent.lower().split()
                    mask = contains_sublist(tokens, word_tokens)

            parsed_sent.append(sent)

        doc = ' '.join(parsed_sent)
        """
        # Change the punctuation to a more readable format for debugging
        punc_compress = ''').,?!':'''
        for punc in punc_compress:
            doc = doc.replace(' '+punc,punc)

        punc_compress = '''('''
        for punc in punc_compress:
            doc = doc.replace(punc+' ',punc)
        """

        return doc
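
contains_sublist() is not shown anywhere in this example; below is a minimal sketch of what it is assumed to do, i.e. return a boolean mask marking every token position where word_tokens starts inside tokens, which is how the loop above uses it (mask.index(True) as the match start, any(mask) to keep replacing).

def contains_sublist(tokens, word_tokens):
    # mask[i] is True when word_tokens occurs in tokens starting at position i
    n = len(word_tokens)
    return [tokens[i:i + n] == word_tokens for i in range(len(tokens))]

print(contains_sublist("big data is big".split(), "big data".split()))
# [True, False, False, False]
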
Example #31
    def __call__(self,org_doc):

        doc = org_doc

        tokens = doc.lower().split()
        ldoc = ' '.join([x for x in tokens if "_" not in x])

        # Identify which phrases were used
        keywords = [key for key in self.X if key in ldoc]
        punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~"           
        
        # Loop over the keywords and replace them one-by-one.
        # This is inefficient, but less error prone.

        parsed_sent = []

        for sent in tokenize(doc, punctuation=punctuation):
            
            for word in keywords:
                word_n_tokens = len(word.split())
                worn_n = len(word)
                
                new_word = self.X[word]
                word_tokens = word.split()

                # Check if the substring tokens match
                tokens = sent.lower().split()
                mask = contains_sublist(tokens, word_tokens)
                while any(mask):
                    idx = mask.index(True)
                    sent = sent.split()
                    args = sent[:idx] + [new_word,] + sent[idx+word_n_tokens:]
                    sent = ' '.join(args)
                    tokens = sent.lower().split()
                    mask = contains_sublist(tokens, word_tokens)

            parsed_sent.append(sent)
        
        doc = ' '.join(parsed_sent)

        """
        # Change the punctuation to a more readable format for debugging
        punc_compress = ''').,?!':'''
        for punc in punc_compress:
            doc = doc.replace(' '+punc,punc)

        punc_compress = '''('''
        for punc in punc_compress:
            doc = doc.replace(punc+' ',punc)
        """

        return doc
Example #32
0
    def __call__(self, data):
        splitted_body = self.get_enrichment(data, 'sentence_splitter')

        tokenized = []
        for paragraph in splitted_body:
            if 'content' in paragraph and paragraph['content']:
                # Tokenize the splitted sentences and
                # join potential sentence splits detected by pattern
                tokenized_sentences = [' '.join(tokenize(s))
                                       for s in paragraph['content']]
                tokenized.append({'content': tokenized_sentences, 'type': paragraph['type']})

        return self.add_enrichment(data, self.name, tokenized)
Example #33
0
 def __iter__(self):
     for root, dirs, files in os.walk(self.dirname):
         for filename in files:
             file_path = root + '/' + filename
             for line in open(file_path):
                 sline = line.strip()
                 if sline == "":
                     continue
                 rline = cleanhtml(sline)
                 tokenized_line = ' '.join(tokenize(rline))
                 is_alpha_word_line = [word for word in
                                       tokenized_line.lower().split()
                                       if word.isalpha()]
                 yield is_alpha_word_line
Example #34
 def __iter__(self):
     for root, dirs, files in os.walk(self.dirname):
         for filename in files:
             file_path = root + '/' + filename
             for line in open(file_path):
                 sline = line.strip()
                 if sline == "":
                     continue
                 rline = clean_html(sline)
                 tokenized_line = r' '.join(tokenize(rline))
                 is_alpha_word_line = [
                     word
                     for word in jieba.cut(tokenized_line, cut_all=False)
                     if word.isalpha()
                 ]
                 yield is_alpha_word_line
Example #35
    def form_sentences(self, text_block, remove_stopwords=False, stem=True):
        """
        parse a block of text and form a list of word-tokenized sentences
        :param text_block: single block of text as string
        :param remove_stopwords: remove the stopwords from the text
        :param stem: stem the words to their root form
        """
        sentences = pattern.tokenize(text_block.lower())
        sentences = [sentence.replace('\'', '').replace('(', ' ').replace(')', ' ') \
                         .replace("/", " or ").replace("-", "") for sentence in sentences]
        sentences = [self.sentence_func(TAG_RE.sub('', sentence)) for sentence in sentences]

        l_stemmer = lambda w: self.stemmer(w) if stem else w
        sentences = [[l_stemmer(w) for w in word_tokenize(sentence)
                      if self.__word_filter(w, remove_stopwords)] for sentence in sentences]
        return sentences
Example #36
0
def _transform_file(file_path, w2id, split_par=False, debug=False):
    """
    Transforms a file containing articles into a 4D list of words divided into sentences,
    paragraphs and docs. Write the result to disk with the name filename_clean.pklz
    :param file_path: file to transform
    """
    if debug:
        print("Cleaning %s" % file_path)
    with open(file_path) as f:
        data = f.read().decode("latin-1")
        docs = data.split("</doc>")
        del data
    if not split_par:
        file_out = "%s_clean_simple" % file_path
    else:
        file_out = "%s_clean_paragraph" % file_path
    file_string = ""
    for doc in [d.strip() for d in docs if d.strip()]:
        paragraphs = [
            tokenize(par)
            for par in remove_title(cleanhtml(doc)).strip().split("\n\n")
            if par
        ]
        doc_a = False
        for p in paragraphs:
            par_a = False
            for sent in p:
                line = [
                    word for word in sent.lower().split()
                    if word.isalpha() or is_number(word)
                ]

                line = " ".join([known(word, w2id) for word in line])
                if line:
                    file_string += line + " <eos> "
                    par_a = True

            if par_a and split_par:
                file_string += " <eop> "

    VectorManager.write_string(file_out, file_string.encode("latin-1"))
    del file_string
    if debug:
        print("Done with %s" % file_path)
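
A minimal sketch of the inner cleaning step only, under stated assumptions: the w2id vocabulary and the known() helper below are hypothetical stand-ins for the ones defined elsewhere in this module, and cleanhtml(), remove_title(), is_number() and VectorManager are omitted.

from pattern.en import tokenize

w2id = {"the": 0, "energy": 1, "of": 2, "stars": 3}            # toy vocabulary
known = lambda word, vocab: word if word in vocab else "<unk>"  # stand-in for known()

paragraph = "The energy of stars comes from nuclear fusion."
file_string = ""
for sent in tokenize(paragraph):
    # keep alphabetic tokens, map out-of-vocabulary words to <unk>
    line = [word for word in sent.lower().split() if word.isalpha()]
    line = " ".join([known(word, w2id) for word in line])
    if line:
        file_string += line + " <eos> "
print(file_string)   # roughly: "the energy of stars <unk> <unk> <unk> <unk> <eos> "
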
Example #37
0
def _transform_file(file_path, debug=False):
    """
    Transforms a file containing articles into a 4D list of words divided into sentences,
    paragraphs and docs. Write the result to disk with the name filename_wl (words list)
    :param file_path: file to transform
    """
    if debug:
        print("Cleaning %s" % file_path)
    with open(file_path) as f:
        raw = f.read().decode("latin-1")
        data = cleanhtml(raw)
        docs = data.split("</doc>")
        del data
    file_out = "%s_wl" % file_path
    file_string = ""
    for doc in [d.strip() for d in docs if d.strip()]:
        paragraphs = [
            tokenize(par)
            for par in remove_title(cleanhtml(doc)).strip().split("\n\n")
            if par
        ]
        doc_a = False
        for p in paragraphs:
            par_a = False
            for sent in p:
                line = " ".join([
                    word for word in sent.lower().split()
                    if word.isalpha() or is_number(word)
                ])
                if line:
                    file_string += line + "\n"
                    par_a = True
                    doc_a = True

            if par_a:
                file_string += "\n"
        if doc_a:
            file_string += "\n"

    VectorManager.write_string(file_out, file_string.encode("latin-1"))
    del file_string
    if debug:
        print("Done with %s" % file_path)
Example #38
	def get_raw_text(self):
		""" gutenberg_text_gather takes a text from a gutenberg url
			and stores it to a file. It only pulls from gutenberg 
			when given the command True. By default the command is False. 
			This function outputs self.raw_text, which 
			is a tokenized text file of my gutenberg book. 
		""" 
		if self.command: # If I tell it to load data from url
			buddhist_psalm_text = URL(self.url).download()

			# Save data to a file (will be part of your data fetching script)
			f = open('buddhist_psalm_text.pickle','wb')
			pickle.dump(buddhist_psalm_text,f)
			f.close()

		# Load data from a file (will be part of your data processing script)
		input_file = open('buddhist_psalm_text.pickle','rb')
		# Use pattern to break the string of text into a list of strings where each string is a sentence
		self.raw_text = tokenize(pickle.load(input_file),) 
Example #39
0
    def form_sentences(self,
                       text_block,
                       block_id,
                       remove_stopwords=False,
                       stem=True,
                       form_tagged_doc=True):
        """
        parse a block of text and form a list of word-tokenized sentences
        :param text_block: single block of text as string
        :param block_id: id of the text block, used for hdfs storage
        :param remove_stopwords: remove the stopwords from the text
        :param stem: stem the words to their root form
        :param form_tagged_doc: form a tagged document for the Doc2vec model
        """
        sentences = pattern.tokenize(text_block.lower())
        sentences = [sentence.replace('\'', '').replace('(', ' ').replace(')', ' ') \
                         .replace("/", " or ").replace("-", "") for sentence in sentences]
        sentences = [
            self.sentence_func(TAG_RE.sub('', sentence))
            for sentence in sentences
        ]

        l_stemmer = lambda w: self.stemmer(w) if stem else w
        sentences = [[
            l_stemmer(w) for w in word_tokenize(sentence)
            if self.__word_filter(w, remove_stopwords)
        ] for sentence in sentences]

        if not form_tagged_doc:
            return sentences

        sentences = [
            TaggedDocument(words=words,
                           tags=[str(block_id) + ' ' + str(index)])
            for index, words in enumerate(sentences)
        ]

        for sentence in sentences:
            self.doc_tags[sentence.tags[0]] = sentence

        return sentences
Example #40
def text_sentiment(text):
    if not text:
        return default_sentiment
    sentences = tokenize(plaintext(text))
    sentiments = [sentiment(s) for s in sentences]
    average_polarity = np.mean([s[0] for s in sentiments])
    std_polarity = np.std([s[0] for s in sentiments])
    average_subjectivity = np.mean([s[1] for s in sentiments])
    std_subjectivity = np.std([s[1] for s in sentiments])

    if math.isnan(average_polarity):
        average_polarity = 0.0
    if math.isnan(std_polarity):
        std_polarity = 0.0
    if math.isnan(average_subjectivity):
        average_subjectivity = 0.0
    if math.isnan(std_subjectivity):
        std_subjectivity = 0.0

    return Sentiment(average_polarity, std_polarity, average_subjectivity, std_subjectivity, len(sentences))
Example #41
    def form_sentences(self, text_block, remove_stopwords=False, stem=True):
        """
        parse a block of text and form a list of word-tokenized sentences
        :param text_block: single block of text as string
        :param remove_stopwords: remove the stopwords from the text
        :param stem: stem the words to their root form
        """
        sentences = pattern.tokenize(text_block.lower())
        sentences = [sentence.replace('\'', '').replace('(', ' ').replace(')', ' ') \
                         .replace("/", " or ").replace("-", "") for sentence in sentences]
        sentences = [
            self.sentence_func(TAG_RE.sub('', sentence))
            for sentence in sentences
        ]

        l_stemmer = lambda w: self.stemmer(w) if stem else w
        sentences = [[
            l_stemmer(w) for w in word_tokenize(sentence)
            if self.__word_filter(w, remove_stopwords)
        ] for sentence in sentences]
        return sentences
Example #42
0
def summarize(text, sentence_count=2):
    sentence_list = tokenize(text)

    # each document's name is the sentence's original index
    # so that we can put them back together later
    docs = [Document(string=sentence, name=index, stemmer=LEMMA)
            for index, sentence in enumerate(sentence_list)]

    graph = Graph()
    for doc_a, doc_b in combinations(docs, 2):
        wordset_a = [x[1] for x in doc_a.keywords()]
        wordset_b = [y[1] for y in doc_b.keywords()]
        similarity = 1 - jaccard(wordset_a, wordset_b)
        if similarity > 0:
            graph.add_edge(doc_a.name, doc_b.name, weight=similarity)

    ranked_sentence_indexes = pagerank(graph).items()
    sentences_by_rank = sorted(
        ranked_sentence_indexes, key=itemgetter(1), reverse=True)
    best_sentences = map(itemgetter(0), sentences_by_rank[:sentence_count])
    best_sentences_in_order = sorted(best_sentences)

    return ' '.join(sentence_list[index] for index in best_sentences_in_order)
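
A hedged usage sketch, assuming summarize() above and its not-shown imports (pattern.en.tokenize, pattern.vector's Document/LEMMA, a jaccard() distance, itertools.combinations, operator.itemgetter, and whichever Graph/pagerank implementation the snippet relies on) are available.

text = ("A star is a massive ball of plasma held together by gravity. "
        "It radiates energy because of the nuclear reactions inside it. "
        "The energy produced by stars radiates away as electromagnetic radiation.")
print(summarize(text, sentence_count=2))   # two top-ranked sentences, in original order
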
Example #43
0
def test_tokenize():
    from pattern.en import tokenize   
    
    sent = "Randstad Technologies - Baltimore , MD - June 2014 to Present Responsibilities Johns Hopkins University , Krieger School of Arts & Sciences June 2014 - present Input Content for websites using the WordPress interface Modified and configured WordPress plug-ins and themes to match design Created Email template for Dean 's Newsletter Launched website and created redirects using .htaccess and Apache conf file"
    lines =  tokenize(sent)
    print lines
Example #44
    def form_relations(self, text, block_id, payload, ff, persist=True):
        """
        form relation(s) on a given text
        :param text: text on which to get the relations on,
        text will be sentence tokenized and relations formed at sentence level
        :param block_id: unique identifier of the block
        :param persist: persist the relations extracted from the text in the sink,
        relation_sink needed to be specified
        :return: list of relations
        """
        text_sentences = pattern.tokenize(text)
        relations = []
        for sentence in text_sentences:

            # work with ascii string only
            sentence = "".join((c for c in sentence if 0 < ord(c) < 127))
            try:
                senna_annotation = self.relation_annotator.getAnnotations(sentence)
            except Exception as e:
                logger.error(e)
                continue

            chunk_parse, pos_tags, role_labeling, tokenized_sentence = \
                senna_annotation['chunk'], senna_annotation['pos'], senna_annotation['srl'], \
                senna_annotation['words']

            # nothing to do here empty srl
            if not role_labeling: continue

            for semantic_element in role_labeling:
                arguments = RelationExtractor.__populate_arguments(semantic_element)
                modifiers = RelationExtractor.__populate_modifier(semantic_element)
                verb = semantic_element.get('V')
                # order of the arguments returned is important, A0 --> A1 --> A2 --> A3
                arguments = [v for v in vars(arguments).itervalues() if v]
                modifiers = [v for v in vars(modifiers).itervalues() if v]

                if not arguments: continue
                argument_pairs = [e for e in ((ai, aj) for i, ai in enumerate(arguments) for j, aj
                                              in enumerate(arguments) if i < j)]

                verb = relation_util.normalize_relation(verb)

                for a0, a1 in argument_pairs:
                    en0 = relation_util.form_entity(tokenized_sentence, a0, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, a1, chunk_parse, pos_tags)
                    if not en0 or not en1: continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff = ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

                for arg_modifier in modifiers:
                    mod_pos = sentence.find(arg_modifier)
                    linked_arg = min([(a, abs(mod_pos - sentence.find(a))) for a in arguments], key=lambda e: e[1])[0]
                    en0 = relation_util.form_entity(tokenized_sentence, linked_arg, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, arg_modifier, chunk_parse, pos_tags)
                    if not en0 or not en1: continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff=ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

        return relations
Example #45
0
for line in neg:
	for v in line.split("\n"):
		if v:
			if v[0] != ';':
				neglist.append(v.strip())

print poslist
print neglist
print lines

poslist = filter(None, poslist)
neglist = filter(None, neglist)

for line in lines:
		sentences = tokenize(line)
		for s in sentences:
			tokens= tokenize(s)
			for word in tokens:
				if word in poslist:
					posop.append(line)
				elif word in neglist:
					negop.append(line)

posop=list(set(posop))
negop=list(set(negop))

print "positive"
for p in posop:
	print p
print "negative"
Example #46
0
for line in neg:
    for v in line.split("\n"):
        if v:
            if v[0] != ';':
                neglist.append(v.strip())

print poslist
print neglist
print lines

poslist = filter(None, poslist)
neglist = filter(None, neglist)

for line in lines:
    sentences = tokenize(line)
    for s in sentences:
        tokens = tokenize(s)
        for word in tokens:
            if word in poslist:
                posop.append(line)
            elif word in neglist:
                negop.append(line)

posop = list(set(posop))
negop = list(set(negop))

print "positive"
for p in posop:
    print p
print "negative"
Example #47
0
def splitSentences(text):
  #  return nltk.tokenize.sent_tokenize(text)
  # use pattern package
    return tokenize(text) 
Example #48
0
    verb=' '.join(bits_to_words(basic_sentence['VP']))

    if verb=='is':
        return "What is "+sbj.lower()+"? "+obj

    return "What does "+sbj.lower()+" "+lemma(verb.lower())+"?"+" "+obj







text="""
A star is a massive ball of plasma (very hot gas) held together by gravity. It radiates energy because of the nuclear reactions inside it

It radiates heat and light, and every other part of the electromagnetic spectrum, such as radio waves, micro-waves, X-rays, gamma-rays and ultra-violet radiation. The proportions vary according to the mass and age of the star.

The energy of stars comes from nuclear fusion. This is a process that turns a light chemical element into another heavier element. Stars are mostly made of hydrogen and helium. They turn the hydrogen into helium by fusion. When a star is near the end of its life, it begins to change the helium into other heavier chemical elements, like carbon and oxygen. Fusion produces a lot of energy. The energy makes the star very hot. The energy produced by stars radiates away from them. The energy leaves as electromagnetic radiation.
"""
sentences=tokenize(text)
basic_sentences=[]
for sentence in sentences:
    print sentence
    basic_sentences=basic_sentences+gather_question_bits(sentence)

basic_sentences=convert_pp(basic_sentences)
for sentence in basic_sentences:
    print basic_sentence_to_question(sentence)

Example #49
0
#refer to http://textminingonline.com/getting-started-with-pattern

from pattern.en import tokenize

f = """this’s pattern word tokenize"""
print "tokens:", tokenize(f)
sent_tokenize_test = """Tokenization is the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens. The list of tokens becomes input for further processing such as parsing or text mining. Tokenization is useful both in linguistics (where it is a form of text segmentation), and in computer science, where it forms part of lexical analysis."""
print "sentence:",tokenize(sent_tokenize_test)


from pattern.en import tag

g = """In corpus linguistics, part-of-speech tagging (POS tagging or POST), also called grammatical tagging or word-category disambiguation, is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition, as well as its context—i.e. relationship with adjacent and related words in a phrase, sentence, or paragraph. A simplified form of this is commonly taught to school-age children, in the identification of words as nouns, verbs, adjectives, adverbs, etc."""
tagged_result = tag(g)

print tagged_result


from pattern.en import referenced
referenced('book')

from pattern.en import singularize
singularize('wolves')

from pattern.en import comparative
comparative('bad')
#‘worse’

from pattern.en import superlative

Example #50
          text = str(text).replace('\xf7','').replace('\xc3\xba','').replace('\xb6','').replace('\xa9','').replace('\xe2\x99\xaa','')
          text = str(text).replace('\xc3\xaf','').replace('\x5c','').replace('\xf1','').replace('\xe1','').replace('\xe7','').replace('\xfa','')
          text = str(text).replace('\xf3','').replace('\xed','').replace('\xe9','').replace('\xe0','').replace('\xae','').replace('\xc2','')
          text = str(text).replace('\xc3','').replace('\xa2','').replace('\xbf','')
#         print text
      except IndexError:
          print line
          continue

# G. Remove clearly wrong unicode characters -- BOM, NULL (only utf8 hex works)
      line = str(line).replace('\x00 ','').replace('\xef\xbf\xbd','')
      print line,

# H. Ensure the text is split into sentences
     # tokenize(string, punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_", replace={})
      for sentence in tokenize(text):
         all = ""

# I. Select the parser
         if sentence.isupper() or sentence.islower(): st = UPP
         else: st = Mix

# J. Parts of speech with stanford-ner via pyner
         reply = st.get_entities(sentence)
         # {u'PERSON': [u'Bill Clinton'], u'LOCATION': [u'U.S.'], u'O': [u'was President of the']}
         try:
             for tup in reply.items():
                names = ""
                if tup[0] == "O" or not tup[0] : continue
                for name in tup[1]:
                   names = "".join([names,"/",name])