Example #1
def get_region_owner(wordlist, region_indicator):
    """ Given a list of tokens (as Quepy Words), return a SAST (sub-)tree
        representing the containment relations (and any connectives)
        between the regions in the list (as identified by region_indicator).
    """
    #print "get_region_owner wordlist", wordlist
    #print "get_region_owner region_indicator", region_indicator

    # NOTE: This treats ands/ors as binary operators.
    tokenlist = [x.token.lower() for x in wordlist]
    if 'and' in tokenlist:
        andidx = tokenlist.index('and')
        # NOTE: We currently treat 'and's as disjunctions
        return IsOrOp() \
             + HasPart( get_region_owner(wordlist[0:andidx], region_indicator) ) \
             + HasPart( get_region_owner(wordlist[andidx + 1:], region_indicator) )
    elif 'or' in tokenlist:
        oridx = tokenlist.index('or')
        return IsOrOp() \
             + HasPart( get_region_owner(wordlist[0:oridx], region_indicator) ) \
             + HasPart( get_region_owner(wordlist[oridx + 1:], region_indicator) )
    else:
        if tokenlist[0] == 'in':
            wordlist = wordlist[1:]
        previous_region = None
        for n in finditer(Question(Lemma('not')) + region_indicator, wordlist):
            r, s = n.span()
            region_name = ' '.join([comp.token
                                    for comp in wordlist[r:s]]).lower()
            current_region = get_name_expression(region_name)

            if previous_region:
                current_region = current_region + HasSubregion(previous_region)
            previous_region = current_region
        return previous_region
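The recursion above splits the token list at the first 'and'/'or' and builds a disjunction node over the two halves. A simplified, self-contained sketch of that splitting, with plain strings and nested tuples standing in for the Quepy Words and the IsOrOp/HasPart SAST nodes (illustration only, not code from the project above):

def split_regions(tokens):
    # Both connectives are treated as disjunctions, as in get_region_owner above.
    for op in ('and', 'or'):
        if op in tokens:
            k = tokens.index(op)
            return ('or', split_regions(tokens[:k]), split_regions(tokens[k + 1:]))
    return ' '.join(tokens)

print(split_regions('lamina and medulla or lobula'.split()))
# -> ('or', 'lamina', ('or', 'medulla', 'lobula'))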
Example #2
 def has_specific_service_question(clauses):
     # Service of a specific type
     print('\nMatching against company-specific service rules')
     select = "?siname ?sidesc"
     sparql = None
     matches = []
     keyword = None
     for i, clause in enumerate(clauses):
         print('Question clause', i, ':')
         for index, sW in enumerate(listW_service):
             for m in finditer(sW, clause):
                 start, end = m.span()
                 matches.extend(clause[start:end])
                 if len(matches) != 0:
                     keyword = keywords_service[index]
             if keyword is not None:
                 break
         for w in clause:
             if w.pos == pos_company:
                 e = "?s vocab:company_chName '{company_name}'."\
                         "?s vocab:hasServiceType ?st."\
                         "?st vocab:hasService ?service."\
                         "?service vocab:service_name '{service_name}'."\
                         "?service vocab:hasServiceItem ?si."\
                         "?si vocab:serviceitem_name ?siname."\
                         "?si vocab:serviceitem_description ?sidesc".format(company_name=w.token,service_name=w.token+'-'+keyword)
                 sparql = SPARQL_SELECT_TEM.format(prefix=SPARQL_PREXIX,
                                                   select=select,
                                                   expression=e)
                 break
     return sparql
Example #3
 def test_finditer1(self):
     tab = self.a + self.b
     regex = tab * (2, None)
     strregex = re.compile("(?:ab){2,}")
     xs = list(refo.finditer(regex, self.seq))
     strxs = list(strregex.finditer(self.string))
     self._eq_list_n_stuff(xs, strxs)
Example #4
def parse_element_into_books(html_elements):
    # Based on https://github.com/machinalis/refo/blob/master/examples/xml_reader.py
    is_header = lambda elem: elem.get('class').startswith('bookMain')
    is_highlight = lambda elem: elem.get('class').startswith('highlightRow')
    regex = Group(Predicate(is_header) + Plus(Predicate(is_highlight)), 'book')
    groups = [html_elements[g['book'][0]:g['book'][1]] for g in finditer(regex, html_elements)]
    return [Book(group) for group in groups]
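A minimal, self-contained sketch of the grouping idea used in parse_element_into_books, with plain dicts standing in for the lxml elements; the element list and class strings below are illustrative assumptions, not data from the repository above.

from refo import finditer, Group, Plus, Predicate

elements = [
    {'class': 'bookMain A'}, {'class': 'highlightRow 1'}, {'class': 'highlightRow 2'},
    {'class': 'bookMain B'}, {'class': 'highlightRow 3'},
]
is_header = Predicate(lambda e: e['class'].startswith('bookMain'))
is_highlight = Predicate(lambda e: e['class'].startswith('highlightRow'))
regex = Group(is_header + Plus(is_highlight), 'book')

for g in finditer(regex, elements):
    i, j = g['book']  # span covering one header element and its highlight rows
    print([e['class'] for e in elements[i:j]])
# -> ['bookMain A', 'highlightRow 1', 'highlightRow 2']
# -> ['bookMain B', 'highlightRow 3']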
Example #5
 def test_finditer1(self):
     tab = self.a + self.b
     regex = tab * (2, None)
     strregex = re.compile("(?:ab){2,}")
     xs = list(refo.finditer(regex, self.seq))
     strxs = list(strregex.finditer(self.string))
     self._eq_list_n_stuff(xs, strxs)
Example #6
    def apply(self, sentence):
        matches = []
        for m in finditer(self.condition, sentence):
            i, j = m.span()
            matches.extend(sentence[i:j])

        return self.action(matches), self.condition_num
Example #7
 def apply(self, sentence):
     matches = []
     # Match the segmented words (the whole input sentence) against the rule, keeping only the matched words (other stray items may appear in between)
     # If a sentence contains several sub-clauses that satisfy the rule, they are all passed to the action function, but only the first sub-clause is processed
     for m in finditer(self.condition, sentence):
         i, j = m.span()
         matches.extend([sentence[i:j]])
     return self.action(matches), self.condition_num
Example #8
 def apply(self, sentence):
     matches = []
     for m in finditer(self.condition, sentence):
         i, j = m.span()
         matches.extend(sentence[i:j])
     if __name__ == '__main__':
         pass
     return self.action(matches)
Example #9
 def apply(self, sentence):
     matches = []
     for m in finditer(self.condition, sentence):
         i, j = m.span()
         print(i, j)
         matches.extend(sentence[i:j])
     if __name__ == '__main__':
         print("----------applying %s----------" % self.action.__name__)
     return self.action(matches)
Example #10
 def test_finditer2(self):
     tab = self.a + self.b
     regex = tab * (2, None) + refo.Group(refo.Plus(self.b), "foobar")
     strregex = re.compile("(?:ab){2,}(b+)")
     xs = list(refo.finditer(regex, self.seq))
     strxs = list(strregex.finditer(self.string))
     xs = [x.group("foobar") for x in xs]
     strxs = [x.span(1) for x in strxs]
     self.assertListEqual(xs, strxs)
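The test above checks that refo's tuple-based repetition behaves like the re module's {2,} quantifier. A small sketch of that equivalence with the a/b/seq/string fixtures spelled out inline (the test class defines its own versions elsewhere, so these values are assumptions):

import re
import refo
from refo import Literal

a, b = Literal('a'), Literal('b')
seq = list('abababbb')                 # refo matches over any Python sequence
string = 'abababbb'

regex = (a + b) * (2, None)            # at least two 'ab' pairs, like (?:ab){2,}
strregex = re.compile('(?:ab){2,}')

spans = [tuple(m.span()) for m in refo.finditer(regex, seq)]
strspans = [m.span() for m in strregex.finditer(string)]
print(spans, spans == strspans)        # -> [(0, 6)] True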
Example #11
 def test_finditer2(self):
     tab = self.a + self.b
     regex = tab * (2, None) + refo.Group(refo.Plus(self.b), "foobar")
     strregex = re.compile("(?:ab){2,}(b+)")
     xs = list(refo.finditer(regex, self.seq))
     strxs = list(strregex.finditer(self.string))
     xs = [x.group("foobar") for x in xs]
     strxs = [x.span(1) for x in strxs]
     self.assertListEqual(xs, strxs)
Example #12
 def apply(self, sentence):
     matches = []
     for m in finditer(self.condition, sentence):
         i, j = m.span()
         matches.extend(sentence[i:j])
     if len(matches) == 0:
         return None
     else:
         return self.action()
Example #13
 def apply(self, sentence):
     matches = []
     # finditer returns an iterable object
     for m in finditer(self.condition, sentence):
         # i, j are the start and end positions of the match
         i, j = m.span()
         matches.extend(sentence[i:j])
     print('matches:')
     print(self.action(matches), self.condition_num)
     return self.action(matches), self.condition_num
Example #14
def parse_element_into_books(html_elements):
    # Based on https://github.com/machinalis/refo/blob/master/examples/xml_reader.py
    is_header = lambda elem: elem.get('class').startswith('bookMain')
    is_highlight = lambda elem: elem.get('class').startswith('highlightRow')
    regex = Group(Predicate(is_header) + Plus(Predicate(is_highlight)), 'book')
    groups = [
        html_elements[g['book'][0]:g['book'][1]]
        for g in finditer(regex, html_elements)
    ]
    return [Book(group) for group in groups]
Example #15
    def apply(self, word_list):
        # There may be more than one place that satisfies the condition, so store them all in the matches list
        matches = []
        # Use the condition to find matching words; finditer uses yield internally, returning each result as soon as it is found and then continuing the search
        # In other words, the value returned by finditer can be iterated over
        for m in finditer(self.condition, word_list):
            i, j = m.span()
            matches.extend(word_list[i:j])  # slice out the matched span of the sentence; it may still contain other stray words

        return self.action(matches), self.condition_num
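The apply() variants in these examples all follow the same pattern: build a refo condition from per-token predicates, then collect the Word objects inside each matched span. A self-contained sketch of that loop; the Word namedtuple, the pos tags, and the pos_is helper are assumptions for illustration, not code from any of the projects quoted here.

from collections import namedtuple
from refo import finditer, Plus, Predicate

Word = namedtuple('Word', ['token', 'pos'])

def pos_is(tag):
    return Predicate(lambda w: w.pos == tag)

sentence = [Word('the', 'DT'), Word('quick', 'JJ'), Word('brown', 'JJ'), Word('fox', 'NN')]
condition = Plus(pos_is('JJ')) + pos_is('NN')   # one or more adjectives followed by a noun

matches = []
for m in finditer(condition, sentence):
    i, j = m.span()
    matches.extend(sentence[i:j])               # keep only the matched word objects

print([w.token for w in matches])               # -> ['quick', 'brown', 'fox']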
Example #16
 def apply(self, sentence):
     matches = []
     for m in finditer(self.condition, sentence):
         i, j = m.span()
         matches.extend(sentence[i:j])
     if len(matches) == 0:
         return None, None
     else:
         # for i in matches:
         #     print i.token, i.pos
         return self.action(matches), self.condition_num
Example #17
 def apply(self, sentence):
     matches = []
     for m in finditer(self.condition, sentence):
         # print 1
         i, j = m.span()
         # print 1, i,j
         matches.extend(sentence[i:j])
     if __name__ == '__main__':
         print "----------applying %s----------" % self.action.__name__
     # print matches
     return self.action(matches)
Example #18
def interpret_ClearSomeCommand(self, match):
    command = IsCommand("clear")
    if getattr(match, "clear_quant", None):
        # TODO: Don't use finditer; just do a search
        for m in finditer(Pos("CD"), match.clear_quant):
            i, j = m.span()
            # TODO: Join with a space? Is the list ever longer than 1, anyway?
            num = ' '.join([c.token for c in match.clear_quant[i:j]])
            command = command + HasEqualTo(num)
    else:
        command = command + HasEqualTo("1")
    return command, "enum"
Example #19
def build_mod_tree(wordlist, synaptic_to):
    """ Given a list of tokens (as Quepy Words), return a SAST (sub-)tree
        representing the detected neuron attributes ("modifiers")
        and any connectives.
        synaptic_to represents the phrase which this sequence is "(pre-/post-)synaptic to".
    """
    #print "build_mod_tree wordlist", wordlist
    #print "build_mod_tree synaptic_to", synaptic_to
    tokenlist = [x.token.lower() for x in wordlist]
    # We make multiple passes, but "hopefully" the modifier list is not that long.
    for idx, token in enumerate(tokenlist):
        if (token == 'and' or token == 'or') and len(tokenlist) > idx + 2:
            # NOTE: We assume something would come after the following 'and'/'or'.
            if tokenlist[idx + 2] == 'and':
                # NOTE: We currently treat 'and's as disjunctions
                return IsOrOp() \
                     + HasPart( build_mod_tree(wordlist[0:idx + 2], synaptic_to) ) \
                     + HasPart( build_mod_tree(wordlist[idx + 3:], synaptic_to) )
            elif tokenlist[idx + 2] == 'or':
                return IsOrOp() \
                     + HasPart( build_mod_tree(wordlist[0:idx + 2], synaptic_to) ) \
                     + HasPart( build_mod_tree(wordlist[idx + 3:], synaptic_to) )
    # If no such ("explicit") 'and'/'or' nodes are found, look for "silent and" nodes.
    for idx, token in enumerate(tokenlist):
        if (token == 'and' or token == 'or') and len(tokenlist) > idx + 2:
            # Based on the scan above, we can assume the token at idx + 2 is not 'and'/'or'.
            # So we treat it as a "silent and".
            return IsAndOp() \
                 + HasPart( build_mod_tree(wordlist[0:idx + 2], synaptic_to) ) \
                 + HasPart( build_mod_tree(wordlist[idx + 2:], synaptic_to) )
    # If we've made it this far, we can assume there's zero or one 'and'/'or's.
    mods = []
    for n in finditer(Predicate(lowercase_is_in(modifiers_and_regions)),
                      wordlist):
        r, s = n.span()
        mod_name = tokenlist[r:s][0]
        mods.append(mod_name)
    if len(mods) == 1:
        return get_name_expression(mods[0], synaptic_to)
    # NOTE: (A reminder..:) We assume 'and'/'or' are not biologically relevant terms...
    # NOTE: We currently treat 'and's as disjunctions
    if 'or' in tokenlist or 'and' in tokenlist:
        op = IsOrOp()
    else:
        # NOTE: We assume juxtaposition of modifiers signifies logical 'and'
        op = IsAndOp()
    for mod in mods:
        op += HasPart(get_name_expression(mod, synaptic_to))
    return op
Example #20
    def has_company_basicinfo_question(clauses):
        print('\nMatching against company attribute rules')
        # Company attributes
        select = "?x"
        sparql = None
        for i, clause in enumerate(clauses):
            print('Question clause', i, ':')
            keyword = None
            matches = []
            for index, cbW in enumerate(listW_company_basic):
                for m in finditer(cbW, clause):
                    start, end = m.span()
                    matches.extend(clause[start:end])
                    if len(matches) != 0:
                        keyword = keyWord_company_baisc[index]
                if keyword is not None:
                    break

            for w in clause:
                if w.pos == pos_company:
                    if keyword == 'company_description':
                        #                        select = "?x ?y"
                        #                        e = "?s vocab:company_chName '{company_name}'."\
                        #                            "?s vocab:company_baidubaikeDescription ?x."\
                        #                            "?s vocab:company_kuaidi100Description ?y.".format(company_name=w.token)
                        select = "?y"
                        e = "?s vocab:company_chName '{company_name}'."\
                            "?s vocab:company_kuaidi100Description ?y.".format(company_name=w.token)
                    else:
                        e = "?s vocab:company_chName '{company_name}'."\
                            "?s vocab:{keyword} ?x.".format(company_name=w.token, keyword=keyword)
                    sparql = SPARQL_SELECT_TEM.format(prefix=SPARQL_PREXIX,
                                                      select=select,
                                                      expression=e)
                    break
        return sparql
Example #21
 def apply(self, sentence):
     for m in finditer(self.condition, sentence):
         i, j = m.span()
         return self.action(sentence[i:j])
Example #22
 def apply(self, word_objects):
     matches = []
     for m in finditer(self.condition, word_objects):
         i, j = m.span()
         matches.extend(word_objects[i:j])
     return self.action(matches), self.condition_weight, self.description
Example #23
parser.add_argument("filename", action="store")
cfg = parser.parse_args()
text = open(cfg.filename).read()


from refo import finditer, Predicate, Literal, Any, Group, Star


def notin(xs):
    return lambda x: x not in xs


name = Predicate(notin("/")) + Star(Predicate(notin(" >")))
name = Group(name, "name")
inside = name + Star(Any(), greedy=False)
opentag = Literal("<") + inside + Literal(">")
opentag = Group(opentag, "open")
closetag = Literal("<") + Literal("/") + inside + Literal(">")
closetag = Group(closetag, "close")
regex = closetag | opentag

depth = 0
for m in finditer(regex, text):
    if "open" in m:
        i, j = m["name"]
        print "  " * depth + text[i:j]
        depth += 1
    else:
        assert "close" in m
        depth -= 1
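The script above reads its input through argparse; the following self-contained variant (an illustrative sketch, not part of the original script) runs the same tag-walking pattern over an inline string and prints each opening tag indented by its nesting depth:

from refo import finditer, Predicate, Literal, Any, Group, Star

def notin(xs):
    return lambda x: x not in xs

name = Group(Predicate(notin('/')) + Star(Predicate(notin(' >'))), 'name')
inside = name + Star(Any(), greedy=False)
opentag = Group(Literal('<') + inside + Literal('>'), 'open')
closetag = Group(Literal('<') + Literal('/') + inside + Literal('>'), 'close')
regex = closetag | opentag

text = '<html><body><p></p></body></html>'
depth = 0
for m in finditer(regex, text):
    if 'open' in m:
        i, j = m['name']
        print('  ' * depth + text[i:j])   # prints html, then body and p indented
        depth += 1
    else:
        depth -= 1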
Example #24
 def apply(self, sentence):
     for m in finditer(self.condition, sentence):
         i, j = m.span()
         if "victim" in m:
             i, j = m.span("victim")
         self.action(sentence[i:j])
Example #25
 def apply(self, sentence):
     for m in finditer(self.condition, sentence):
         i, j = m.span()
         if "victim" in m:
             i, j = m.span("victim")
         self.action(sentence[i:j])
Example #26
def main():
    ##set_file_name = raw_input('Enter a file name: ')
    file_name = raw_input('Enter a file name: ')
    test_file = open(file_name, 'r')
    rawtext = test_file.read()
    ##GET ALL KEYWORDS
    get_all_keywords = []
    #Extract title from text
    title = get_title(rawtext)
    first_sen = get_first_sen(rawtext)
    
    #Get paragraph without title
    para_list = rawtext.splitlines()[1:] #in list
    para_string = ''.join(para_list) #convert to string
    
    #Prettify paragraph
    prettify_txt = re.sub(r'[^\w.]', ' ', para_string)
    mod_txt = remov_stopword(prettify_txt)
    
    #Tokenizing & POS Tagging
    token_txt = nltk.sent_tokenize(mod_txt) #Line Segment
    num_sent = len(token_txt) #Number of sentences
    token_word = [nltk.word_tokenize(sent) for sent in token_txt]
    pos_tag = [nltk.pos_tag(sent) for sent in token_word]
    
    
    ##print title
    print "Sentence: ", num_sent
    print '\n'
    
    #Chunk and print NP
    get_nouns = [[Word(*x) for x in sent] for sent in pos_tag]

    #NNP Rules
    rule_0 = W(pos = "NNS")| W(pos = "NN") | W(pos = "NNP")
    rule_05 = W(pos = "NNP") + W(pos = "NNS")
    rule_1 = W(pos = "WP$") + W(pos = "NNS")
    rule_2 = W(pos = "CD") + W(pos = "NNS")
    rule_3 = W(pos = "NN") + W(pos = "NN")
    rule_4 = W(pos = "NN") + W(pos = "NNS")
    rule_5 = W(pos = "NNP") + W(pos = "CD")
    rule_6 = W(pos = "NNP") + W(pos = "NNP")
    rule_7 = W(pos = "NNP") + W(pos = "NNPS")
    rule_8 = W(pos = "NNP") + W(pos = "NN")
    rule_9 = W(pos = "NNP") + W(pos = "VBZ")
    rule_10 = W(pos = "DT") + W(pos = "NNS")
    rule_11 = W(pos = "DT") + W(pos = "NN")
    rule_12 = W(pos = "DT") + W(pos = "NNP")
    rule_13 = W(pos = "JJ") + W(pos = "NN")
    rule_14 = W(pos = "JJ") + W(pos = "NNS")
    rule_15 = W(pos = "PRP$") + W(pos = "NNS")
    rule_16 = W(pos = "PRP$") + W(pos = "NN")
    rule_02 = W(pos = "NN") + W(pos = "NN") + W(pos = "NN")
    rule_17 = W(pos = "NN") + W(pos = "NNS") + W(pos = "NN")
    rule_18 = W(pos = "NNP") + W(pos = "NNP") + W(pos = "NNP")
    rule_19 = W(pos = "JJ") + W(pos = "NN") + W(pos = "NNS")
    rule_20 = W(pos = "PRP$") + W(pos = "NN") + W(pos = "NN")
    rule_21 = W(pos = "DT") + W(pos = "JJ") + W(pos = "NN")
    rule_22 = W(pos = "DT") + W(pos = "CD") + W(pos = "NNS")
    rule_23 = W(pos = "DT") + W(pos = "VBG") + W(pos = "NN")
    rule_24 = W(pos = "DT") + W(pos = "NN") + W(pos = "NN")
    rule_25 = W(pos = "NNP") + W(pos = "NNP") + W(pos = "VBZ")
    rule_26 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NN")
    rule_27 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NNP")
    rule_28 = W(pos = "DT") + W(pos = "JJ") + W(pos = "NN")
    rule_29 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NNP") + W(pos = "NNP")
    rule_30 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NN") + W(pos = "NN") 

    NP_bi_gram_set = (rule_05)|(rule_1)|(rule_2)|(rule_3)|(rule_4)|(rule_5)|(rule_6)|(rule_7)|(rule_8)|(rule_9)|(rule_10)|(rule_11)|(rule_12)|(rule_13)|(rule_14)|(rule_15)|(rule_16)
    NP_tri_gram_set = (rule_02)|(rule_17)|(rule_18)|(rule_19)|(rule_20)|(rule_21)|(rule_22)|(rule_23)|(rule_24)|(rule_25)|(rule_26)|(rule_27)|(rule_28)
    NP_quard_gram_set = (rule_29)|(rule_30)

    #Rule set function
    get_uni_gram = (rule_0)
    get_bi_gram = NP_bi_gram_set
    get_tri_gram = NP_tri_gram_set
    get_quard_gram = NP_quard_gram_set

    bag_of_NP = []
    bag_of_biNP = []
    bag_of_triNP = []
    bag_of_fourNP = []
    total__tfidf = 0
    ###################################GET UNIGRAMS###################################
    ##print "UNIGRAM -->"
    for k, s in enumerate(get_nouns):
        for match in finditer(get_uni_gram, s):
            x, y = match.span() #the match spans x to y inside the sentence s
            #print pos_tag[k][x:y]
            bag_of_NP += pos_tag[k][x:y]
            
    ###############      
    #Term Frequency for unigrams    
    ##print "\nUnigram Feature Matrices:"
    total__tfidf = 0
    uni_tfidf_values = ''
    str_uni_grams = ''
    total_docs = count_total_corpus()
    fdist = nltk.FreqDist(bag_of_NP)
    print fdist
    ##STORE UNIGRAMS
    unzipped_uni = zip(*bag_of_NP)
    str_unigrams = list(unzipped_uni[0])
    get_unigrams = zip(str_unigrams,str_unigrams[1:])[::1]
    ###############
    
    ##UNI MAXIMUM TermScore##
    scores = []
    for word in fdist:
        score = fdist[word]
        scores.append(score)
    max_uni = max(scores)
    ######################
 
    for word in fdist:
        fq_word = fdist[word]
        ##print '%s->%d' % (word, fq_word)
        get_tf = term_frequency(fq_word, max_uni)

        ### FEATURES ###
        ##Tuple to String##
        to_string = ':'.join(word)
        get_this_string = convert_to_string(to_string)

        ##DF Score
        num_of_doc_word = count_nterm_doc(get_this_string)
        ##
        ##TF.IDF Score
        idf_score = inverse_df(total_docs, num_of_doc_word)
        tf_idf_scr = get_tf * idf_score
        total__tfidf += tf_idf_scr

        ##GET EACH UNIGRAMS TFIDF
        uni_tfidf_scr = repr(tf_idf_scr)+' '
        uni_tfidf_values += uni_tfidf_scr
        str_uni_grams += get_this_string+','

    ##BUILD DICT FOR EACH TERMS
    get_uni_float = [float(x) for x in uni_tfidf_values.split()]
    get_uni_list = str_uni_grams.split(',')
    unigram_dict = dict(zip(get_unigrams, get_uni_float))
    ###########################  

    ##GET TFIDF FOR UNIGRAMS##
    ############
    uni_avg_tfidf = (sum(map(float,get_uni_float)))/(len(get_uni_float))
    ###########################
    get_zip_str = [''.join(item) for item in str_unigrams]
    ###Unigrams string with TFIDF###
    unigrams_list =  zip(get_zip_str, get_uni_float)
    ###########################
    
    ##print '===============***==============='
   ## print 'Total Unigrams: ', len(fdist)
   ## print 'Total tfidf', total__tfidf
    ##print 'Average TF.IDF: ', uni_avg_tfidf
    ##print '===============***==============='
    ###########################
    ##### TFIDF FEATURE MATRIX #####
    uni_feat_tfidf = []
    for x in unigrams_list:
        if float(x[1]) > uni_avg_tfidf:
            uni_feat_tfidf.append(1)
        else:
            uni_feat_tfidf.append(0)
    zip_tfidf_feat = zip(get_zip_str, get_uni_float, uni_feat_tfidf)
    ##print zip_tfidf_feat
    ###############################
    ##### First Sentence Feat #####
    uni_fir_sen = []
    for x in unigrams_list:
        get_res = chk_frs_sen(x[0], file_name)
        if get_res == 1:
            uni_fir_sen.append(1)
        else:
            uni_fir_sen.append(0)
    zip_fir_sen_feat = zip(get_zip_str, get_uni_float, uni_feat_tfidf, uni_fir_sen)
    ############################
    ##### Involve in Title #####
    uni_title_feat = []
    for x in unigrams_list:
        get_res = involve_in_title(x[0], title)
        if get_res == 1:
            uni_title_feat.append(1)
        else:
            uni_title_feat.append(0)
    zip_uni_feats = zip(get_zip_str, get_uni_float, uni_feat_tfidf, uni_fir_sen, uni_title_feat)
    ##print zip_uni_feats
    ################################
    ##print "\n\n"
    ###################################GET BIGRAMS###################################
    ##print "BIGRAM -->"
    for k, s in enumerate(get_nouns):
        for match in finditer(get_bi_gram, s):
            x, y = match.span()
            ##print pos_tag[k][x:y]
            bag_of_biNP += pos_tag[k][x:y]
            
    ##Term Frequency for bigrams##
    total__tfidf = 0
    bi_tfidf_values = ''
    str_bi_grams = ''
    ###############
    ##STORE BIGRAMS
    unzipped = zip(*bag_of_biNP)
    str_bigrams = list(unzipped[0])
    get_bigrams = zip(str_bigrams,str_bigrams[1:])[::2]
    ###############
    
    ##print "\nBigram Feature Matrices:"
    bi_dist = nltk.FreqDist(bag_of_biNP)

    ##BI MAXIMUM TermScore##
    bi_scores = []
    for word in bi_dist:
        score = bi_dist[word]
        bi_scores.append(score)
    max_bi = max(bi_scores)
    ######################
    
    for word in bi_dist:
        tq_word = bi_dist[word]
        ##print '%s-->%d' % (word, tq_word)
        get_tf = term_frequency(tq_word, max_bi)
        
        ### FEATURES ###
        ##Tuple to String##
        to_string = ':'.join(word)
        get_this_string = convert_to_string(to_string)
        
        ##DF Score
        num_of_doc_word = count_nterm_doc(get_this_string)
        
        ##TF.IDF Score
        idf_score = inverse_df(total_docs, num_of_doc_word)
        tf_idf_scr = get_tf*idf_score
        total__tfidf += tf_idf_scr

        ##GET EACH BIGRAMS TFIDF
        get_tfidf_scr = repr(tf_idf_scr)+' '
        bi_tfidf_values += get_tfidf_scr
        str_bi_grams += get_this_string+','

    ##BUILD DICT FOR EACH TERMS
    get_float = [float(x) for x in bi_tfidf_values.split()]
    get_bi_list = str_bi_grams.split(',')
    bigram_dict = dict(zip(get_bi_list, get_float))
    ###########################
    
    ##GET TFIDF FOR BIGRAMS##
    get_bi_floats = get_val_bipairs(bigram_dict, get_bigrams)
    get_zip = dict(zip(get_bigrams, get_bi_floats))
    ############
    real_avg_tfidf = (sum(map(float,get_bi_floats)))/(len(get_bi_floats))
    ###########################
    get_zip_str = [' '.join(item) for item in get_bigrams]
    ###Bigrams string with TFIDF###
    bigrams_list =  zip(get_zip_str, get_bi_floats)
    ###########################
    ##print bigrams_list
    
    ##print '===============***==============='
    ##print 'Total Bigrams: ', len(get_bi_floats)
    ##print 'total tfidf: ', sum(map(float,get_bi_floats))
    ##print 'Average TF.IDF: ', real_avg_tfidf
    ##print '===============***==============='
    ##print len(bi_str2_float(bi_tfidf_values))
    ##print type(bag_of_biNP)
    ##### TFIDF FEATURE MATRIX #####
    feat_tfidf_matx = []
    for x in bigrams_list:
        if float(x[1]) > real_avg_tfidf:
            feat_tfidf_matx.append(1)
        else:
            feat_tfidf_matx.append(0)
            
    tfidf_feat = zip(get_zip_str, get_bi_floats, feat_tfidf_matx)
    #################################
    #### FIRST SENTENCE FEATURE ####
    feat_fir_sen = []
    for x in tfidf_feat:
        get_res = chk_frs_sen(x[0], file_name)
        if get_res == 1:
            feat_fir_sen.append(1)
        else:
            feat_fir_sen.append(0)
            
    fir_sen_feat = zip (get_zip_str, get_bi_floats, feat_tfidf_matx, feat_fir_sen)
    ##print fir_sen_feat
    #################################
    #### INVOLVE IN TITLE FEATURE ###
    feat_invol_tit = []
    for x in fir_sen_feat:
        get_res = involve_in_title(x[0], title)
        if get_res == 1:
            feat_invol_tit.append(1)
        else:
            feat_invol_tit.append(0)
    invol_tit_feat = zip (get_zip_str, get_bi_floats, feat_tfidf_matx, feat_fir_sen, feat_invol_tit)
    ##print invol_tit_feat
    #################################
    ##print "\n\n"
    
    ###################################GET TRIGRAMS###################################
    ##print "TRIGRAM -->"
    for k, s in enumerate(get_nouns):
        for match in finditer(get_tri_gram, s):
            x, y = match.span()
            ##print pos_tag[k][x:y]
            bag_of_triNP += pos_tag[k][x:y]
            
    #Term Frequency for trigrams
    total__tfidf = 0
    tri_tfidf_values = ''
    str_tri_grams = ''
    ###############
    
    ##STORE TRIGRAMS
    unzipped_tri = zip(*bag_of_triNP)
    str_trigrams = list(unzipped_tri[0])
    get_trigrams = zip(str_trigrams,str_trigrams[1:],str_trigrams[2:])[::3]
    ###############
    
    ##print "\nTrigram Feature Matrices:"
    tri_dist = nltk.FreqDist(bag_of_triNP)

    ##TRI MAXIMUM TermScore##
    tri_scores = []
    for word in tri_dist:
        score = tri_dist[word]
        tri_scores.append(score)
    max_tri = max(tri_scores)
    ######################
    for word in tri_dist:
        tr_fq = tri_dist[word]
        ##print '%s-->%d' % (word, tr_fq)
        get_tf = term_frequency(tr_fq, max_tri)
    
        ### FEATURES ###
        ##Tuple to String##
        to_string = ':'.join(word)
        get_this_string = convert_to_string(to_string)
        ##DF Score
        num_of_doc_word = count_nterm_doc(get_this_string)
        ##
        ##TF.IDF Score
        idf_score = inverse_df(total_docs, num_of_doc_word)
        tf_idf_scr = get_tf * idf_score
        total__tfidf += tf_idf_scr

        ##GET EACH TRIGRAMS TFIDF
        get_tfidf_scr = repr(tf_idf_scr)+' '
        tri_tfidf_values += get_tfidf_scr
        str_tri_grams += get_this_string+','

    ##BUILD DICT FOR EACH TERMS
    get_tri_float = [float(x) for x in tri_tfidf_values.split()]
    get_tri_list = str_tri_grams.split(',')
    trigram_dict = dict(zip(get_tri_list, get_tri_float))
    ###########################
    
    ##GET TFIDF FOR TRIGRAMS##
    get_tri_floats = get_val_tripairs(trigram_dict, get_trigrams)
    get_tri_zip = dict(zip(get_trigrams, get_tri_floats))
    ############
    tri_avg_tfidf = (sum(map(float,get_tri_floats)))/(len(get_tri_floats))
    ###########################
    get_ziptri_str = [' '.join(item) for item in get_trigrams]
    ###Bigrams string with TFIDF###
    trigrams_list =  zip(get_ziptri_str, get_tri_floats)
    ###########################
    ##print '===============***==============='
    ##print 'Total Trigrams: ', len(get_tri_floats)
    ##print 'Total tfidf', sum(map(float,get_tri_floats))
    ##print 'Average TF.IDF: ', tri_avg_tfidf
    ##print '===============***==============='
    ##### TFIDF FEATURE MATRIX #####
    tri_tfidf_matx = []
    for x in trigrams_list:
        if float(x[1]) > tri_avg_tfidf:
            tri_tfidf_matx.append(1)
        else:
            tri_tfidf_matx.append(0)
            
    tri_tfidf_feat = zip(get_ziptri_str, get_tri_floats, tri_tfidf_matx)
    ################################
    #### FIRST SENTENCE FEATURE ####
    tri_fir_sen = []
    for x in tri_tfidf_feat:
        get_res = chk_frs_sen(x[0], file_name)
        if get_res == 1:
            tri_fir_sen.append(1)
        else:
            tri_fir_sen.append(0)
            
    tri_sen_feat = zip (get_ziptri_str, get_tri_floats, tri_tfidf_matx, tri_fir_sen)
    #################################
    #### INVOLVE IN TITLE FEATURE ###
    tri_invol_tit = []
    for x in tri_sen_feat:
        get_res = involve_in_title(x[0], title)
        if get_res == 1:
            tri_invol_tit.append(1)
        else:
            tri_invol_tit.append(0)
    tri_tit_feat = zip (get_ziptri_str, get_tri_floats, tri_tfidf_matx, tri_fir_sen, tri_invol_tit)
    ##print tri_tit_feat
    #################################
    ##print "\n\n"

    ###################################GET 4-GRAMS###################################
    ##print "4th GRAM -->"
    for k, s in enumerate(get_nouns):
        for match in finditer(get_quard_gram, s):
            x,y = match.span()
            ##print pos_tag[k][x:y]
            bag_of_fourNP += pos_tag[k][x:y]

    #Term Frequency for 4-grams
    total__tfidf = 0
    four_tfidf_values = ''
    str_four_grams = ''
    ###############
    if (len(bag_of_fourNP)>0):
        
        ##STORE 4-GRAMS
        unzipped_four = zip(*bag_of_fourNP)
        str_fourgrams = list(unzipped_four[0])
        get_fourgrams = zip(str_fourgrams,str_fourgrams[1:],str_fourgrams[2:],str_fourgrams[3:])[::4]
        ###############

        #Term Frequency for 4-grams
        total__tfidf = 0
        ##print "\n4-grams Feature Matrices:"
        f_dist = nltk.FreqDist(bag_of_fourNP)

        ##4 MAXIMUM TermScore##
        four_scores = []
        for word in f_dist:
            score = f_dist[word]
            four_scores.append(score)
        max_four = max(four_scores)
        ######################
        for word in f_dist:
            fr_fq = f_dist[word]
            ##print '%s-->%d' % (word, fr_fq)
            get_tf = term_frequency(fr_fq, max_four)

            ### FEATURES ###
            ##Tuple to String##
            to_string = ':'.join(word)
            get_this_string = convert_to_string(to_string)
            ##DF Score
            num_of_doc_word = count_nterm_doc(get_this_string)
            ##
            ##TF.IDF Score
            idf_score = inverse_df(total_docs, num_of_doc_word)
            tf_idf_scr = get_tf * idf_score
            total__tfidf += tf_idf_scr

            ##GET EACH FOURGRAMS TFIDF
            get_tfidf_scr = repr(tf_idf_scr)+' '
            four_tfidf_values += get_tfidf_scr
            str_four_grams += get_this_string+','

        ##BUILD DICT FOR EACH TERMS
        get_four_float = [float(x) for x in four_tfidf_values.split()]
        get_four_list = str_four_grams.split(',')
        fourgram_dict = dict(zip(get_four_list, get_four_float))
        ###########################

        ##GET TFIDF FOR 4-GRAMS##
        get_four_floats = get_val_fpairs(fourgram_dict, get_fourgrams)
        get_four_zip = dict(zip(get_fourgrams, get_four_floats))
        ############
        four_avg_tfidf = (sum(map(float,get_four_floats)))/(len(get_four_floats))
        ###########################
        get_zipfour_str = [' '.join(item) for item in get_fourgrams]
        ###Bigrams string with TFIDF###
        fourgrams_list =  zip(get_zipfour_str, get_four_floats)
        ###########################
        ##print '===============***==============='
        ##print 'Total 4-grams: ', len(get_four_floats)
        ##print 'Total tfidf', sum(map(float,get_four_floats))
        ##print 'Average TF.IDF: ', four_avg_tfidf
        ##print '===============***==============='
        ##### TFIDF FEATURE MATRIX #####
        four_tfidf_matx = []
        for x in fourgrams_list:
            if float(x[1]) > four_avg_tfidf:
                four_tfidf_matx.append(1)
            else:
                four_tfidf_matx.append(0)
            
        four_tfidf_feat = zip(get_zipfour_str, get_four_floats, four_tfidf_matx)
        #################################
        #### FIRST SENTENCE FEATURE ####
        four_fir_sen = []
        for x in four_tfidf_feat:
            get_res = chk_frs_sen(x[0], file_name)
            if get_res == 1:
                four_fir_sen.append(1)
            else:
                four_fir_sen.append(0)
            
        four_sen_feat = zip (get_zipfour_str, get_four_floats, four_tfidf_matx, four_fir_sen)
        #################################
        #### INVOLVE IN TITLE FEATURE ###
        four_invol_tit = []
        for x in four_sen_feat:
            get_res = involve_in_title(x[0], title)
            if get_res == 1:
                four_invol_tit.append(1)
            else:
                four_invol_tit.append(0)
        four_tit_feat = zip (get_zipfour_str, get_four_floats,four_tfidf_matx, four_fir_sen, four_invol_tit)
        ##print four_tit_feat
        #################################

    else:
        four_tit_feat = ''
        print 'Zero Fourgram\n'
      
    ##print zip_uni_feats, invol_tit_feat, tri_tit_feat, four_tit_feat
    ##print uni_avg_tfidf,real_avg_tfidf, tri_avg_tfidf,four_avg_tfidf
    key_unigram = cal_matrix(zip_uni_feats, uni_avg_tfidf,'uni_tf.txt','uni_fs.txt','uni_tit.txt')
    print '\n'
    key_bigram = cal_matrix(invol_tit_feat, real_avg_tfidf,'bi_tf.txt','bi_fs.txt','bi_tit.txt')
    print '\n'
    key_trigram = cal_tri_matrix(tri_tit_feat, tri_avg_tfidf,'tri_tf.txt','tri_fs.txt','tri_tit.txt')
    print '\n'
    if not four_tit_feat:
        print 'No 4-grams in document.'
        get_all_keywords = key_unigram + key_bigram + key_trigram
        print len(get_all_keywords),' keywords for total n-grams.'
        get_time = (time.time() - start_time)
        get_milli = get_time*1000
        print("--- %s seconds ---" % get_time) 
    else:
        key_four = cal_four_matrix(four_tit_feat, four_avg_tfidf,'four_tf.txt','four_fs.txt','four_tit.txt')
        ##get_all_keywords = key_unigram + key_bigram + key_trigram + key_four
        get_all_keywords = key_unigram + key_bigram + key_trigram + key_four
        print len(get_all_keywords),' keywords for total n-grams.'
        get_time = (time.time() - start_time)
        get_milli = get_time*1000
        print("--- %s seconds ---" % get_time)
    ##GET SUMMARY##
    summary(key_unigram, title, prettify_txt)
Example #27
def interpret_NeuronsQuery_MoreGeneral(self, match):
    #print "interpret_NeuronsQuery_MoreGeneral", match._words, match.words, match._particles
    neuron = IsNeuron() + HasClass('Neuron')

    # NOTE: "format" group overrides any "opener" group--for formatting
    #       e.g. "List neurons in Lamina as morphology" will use morphology formatting.
    if getattr(match, 'formatting', None):
        form_lems = match.formatting.lemmas.lower()
        if 'list' in form_lems or 'information' in form_lems:
            neuron = neuron + HasFormat('information')
        elif 'network' in form_lems:
            neuron = neuron + HasFormat('network')
            # NOTE: We don't even bother checking for "morphology", since that's assumed default.
    elif getattr(match, 'opener', None):
        form_lems = match.opener.lemmas.lower()
        if 'list' in form_lems:
            neuron = neuron + HasFormat('information')
        elif 'graph' in form_lems:
            neuron = neuron + HasFormat('network')

    if getattr(match, 'region_list', None):
        neuron = neuron + OwnedBy(
            get_region_owner(match.region_list, notneurons))
    if getattr(match, 'neuron_modifiers', None):
        mods = []
        # NOTE: The following assumes whitespace separates JJs / NNs:
        for m in finditer((Pos("JJ") | Pos("NN")), match.neuron_modifiers):
            i, j = m.span()
            mod_name = match.neuron_modifiers[i:j][0].token.lower()
            mods.append(mod_name)
        if len(mods) == 1:
            modifier = get_name_expression(mods[0])
            neuron = neuron + Has(modifier)
        elif len(mods) > 1:
            # NOTE: We assume this is a disjunction of modifiers for now
            andop = IsOrOp()
            for mod in mods:
                modifier = get_name_expression(mod)
                andop += HasPart(modifier)
            neuron += Has(andop)
    if getattr(match, "transmitters", None):
        mods = []
        # NOTE: The following assumes whitespace separates adjnouns:
        for m in finditer(adjnoun, match.transmitters):
            i, j = m.span()
            mod_name = match.transmitters[i:j][0].token.lower()
            mods.append(mod_name)
        if len(mods) == 1:
            modifier = get_name_expression(mods[0])
            neuron = neuron + Has(modifier)
        elif len(mods) > 1:
            # NOTE: We assume this is a disjunction of modifiers for now
            andop = IsOrOp()
            for mod in mods:
                modifier = get_name_expression(mod)
                andop += HasPart(modifier)
            neuron += Has(andop)
    if getattr(match, "neuron_name", None):
        # In keeping with the "spirit of the tree" (as described in the codegen file),
        # we put the neuron name in the neuron node if there's only 1 name;
        # We only need to create a 'has' node if there's more than 1 name (e.g. and, or).
        # For now, we only support one name to begin with.
        neuron = neuron + HasName(match.neuron_name.tokens)
    if getattr(match, "expressing_marker", None):
        marker = IsGeneticMarker() + HasName(match.expressing_marker.lemmas)
        neuron = neuron + HasGeneticMarker(marker)
    if getattr(match, "conn_quant", None):
        quantdir = IsNumConnections()
        # TODO: Don't use finditer; just do a search. Clean this up.
        for n in finditer(Pos("CD"), match.conn_quant):
            r, s = n.span()
            conn_num = ' '.join([c.token for c in match.conn_quant[r:s]])
            moreorless = False
            for o in finditer(Lemmas("more than"), match.conn_quant):
                quantdir = quantdir + HasMoreThan(conn_num)
                moreorless = True
            for o in finditer(Lemmas("less than"), match.conn_quant):
                quantdir = quantdir + HasLessThan(conn_num)
                moreorless = True
            if not moreorless:
                quantdir = quantdir + HasEqualTo(conn_num)
        for n in finditer(
                Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS"),
                match.conn_quant):
            r, s = n.span()
            conn_target = ' '.join([c.token for c in match.conn_quant[r:s]])
            # TODO: Make conn_target.lower() ?
            quantdir = quantdir + HasConnectionsTarget(conn_target)
        neuron = neuron + HasConnections(quantdir)
    # neuron_label = NameOf( neuron )
    # return neuron_label, "enum"
    return neuron, "enum"
Example #28
        def get_subquery(m, matchwords):
            neuron = IsNeuron() + HasClass('Neuron')
            owned_region = None
            global syn_num
            #print matchwords
            #print m.state
            if 'synapse_num_clause' in m:
                p, q = m['synapse_num_clause']
                conn_quant_words = matchwords[p:q]
                # TODO: Perform a search instead of finditer
                for n in finditer(Pos("CD"), conn_quant_words):
                    r, s = n.span()
                    conn_num = ' '.join(
                        [c.token for c in conn_quant_words[r:s]])
                    moreorless = False  # See above...
                    for o in finditer(Lemmas("more than"), conn_quant_words):
                        syn_num = HasMoreThan(conn_num)
                        moreorless = True
                    for o in finditer(Lemmas("less than"), conn_quant_words):
                        syn_num = HasLessThan(conn_num)
                        moreorless = True
                    for o in finditer(Lemma("atleast"), conn_quant_words):
                        syn_num = HasAtLeast(conn_num)
                        moreorless = True
                    for o in finditer(Lemma("atmost"), conn_quant_words):
                        syn_num = HasAtMost(conn_num)
                        moreorless = True
                    for o in finditer(Lemmas("at least"), conn_quant_words):
                        syn_num = HasAtLeast(conn_num)
                        moreorless = True
                    for o in finditer(Lemmas("at most"), conn_quant_words):
                        syn_num = HasAtMost(conn_num)
                        moreorless = True

                    if not moreorless:
                        syn_num = HasEqualTo(conn_num)
                #print "syn_num", syn_num
            # The (sub-)subquery which this subquery is "(pre-/post-)synaptic to".
            synaptic_to = None
            if 'synaptic_phrase' in m:
                p, q = m['synaptic_phrase']
                synaptic_phrase = matchwords[p:q]

                # TODO: Clean this up.
                spl = [w.lemma.lower() for w in synaptic_phrase]
                to_idx = spl.index('to')
                syn_type = set()
                or_syns = False
                if 'presynaptic' in spl[:to_idx]:
                    syn_type.add('presynaptic')
                if 'postsynaptic' in spl[:to_idx]:
                    syn_type.add('postsynaptic')
                if 'or' in spl[:to_idx]:
                    or_syns = True

                # NOTE: We currently only support one subquery here, anyway.
                for n in finditer(self.subquery, synaptic_phrase[to_idx + 1:]):
                    r, s = n.span()
                    synaptic_to, _ = get_subquery(n,
                                                  synaptic_phrase[to_idx + 1:])
                if 'presynaptic' in syn_type:
                    neuron += PresynapticTo(synaptic_to)
                elif 'postsynaptic' in syn_type:
                    neuron += PostsynapticTo(synaptic_to)
                if syn_num: neuron += syn_num
                # This is basically just a trick to update the existing ("parent") subquery.
                # TODO: Clean this up.
                for m in finditer(self.subquery, matchwords[:p]):
                    break

            if 'region_list' in m:
                p, q = m['region_list']
                owned_region = get_region_owner(
                    matchwords[p:q], Predicate(lowercase_is_in(regions)))
                neuron = neuron + OwnedBy(owned_region)

            # We identify transmitters and neuron types with the "has" relation (e.g. in the SAST),
            # so, to support conjunctions/disjunctions of these modifiers while also keeping the SAST
            # "simple" with at most one "has" relation per node, we calculate the "has" relations later.
            has_modifiers = []
            if 'neuron_modifiers' in m:
                p, q = m['neuron_modifiers']
                modifiers_words = [x for x in matchwords[p:q] if x.pos != ',']

                has_modifiers.append(
                    build_mod_tree(modifiers_words, synaptic_to))
            if 'transmitters' in m:
                p, q = m['transmitters']
                modifiers_words = [x for x in matchwords[p:q] if x.pos != ',']

                has_modifiers.append(
                    build_mod_tree(modifiers_words, synaptic_to))
            if 'neurons' in m:
                p, q = m['neurons']
                # NOTE: We assume that this can only be "interneuron(s)" or "neuron(s)"
                if 'interneuron' in ''.join([x.lemma
                                             for x in matchwords[p:q]]):
                    has_modifiers.append(IsAttribute() + HasKey('locality') +
                                         HasValue('True'))
            else:
                # NOTE: For now, we assume that a neuron 'name/type' (and not "neuron") is present
                # for n in finditer( Pos("CD"), conn_quant_words ):
                pass
            if 'expressing_marker' in m:
                p, q = m['expressing_marker']
                expressing_lemmas = [x.lemma for x in matchwords[p:q]]

                # This is just a temporary solution--before support for genetic markers is added.
                marker = IsGeneticMarker() + HasName(
                    ' '.join(expressing_lemmas))
                neuron = neuron + HasGeneticMarker(marker)
                # TODO: Include this as a 'has' relation (as above)?
            if 'conn_quant' in m:
                # NOTE: This is currently unused by the code generator
                p, q = m['conn_quant']
                conn_quant_words = matchwords[p:q]

                quantdir = IsNumConnections()
                # TODO: Perform a search instead of finditer
                for n in finditer(Pos("CD"), conn_quant_words):
                    r, s = n.span()
                    conn_num = ' '.join(
                        [c.token for c in conn_quant_words[r:s]])
                    moreorless = False  # See above...
                    for o in finditer(Lemmas("more than"), conn_quant_words):
                        quantdir = quantdir + HasMoreThan(conn_num)
                        moreorless = True
                    for o in finditer(Lemmas("less than"), conn_quant_words):
                        quantdir = quantdir + HasLessThan(conn_num)
                        moreorless = True
                    if not moreorless:
                        quantdir = quantdir + HasEqualTo(conn_num)
                for n in finditer(
                        Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS"),
                        conn_quant_words):
                    r, s = n.span()
                    conn_target = ' '.join(
                        [c.token for c in conn_quant_words[r:s]])
                    # TODO: Make conn_target.lower() ?
                    quantdir = quantdir + HasConnectionsTarget(conn_target)
                neuron = neuron + HasConnections(quantdir)
            if 'connections_clause' in m:
                p, q = m['connections_clause']
                connections_words = [
                    x for x in matchwords[p:q] if x.pos != ','
                ]
                connectives = []
                segments = [[]]
                seg_idx = 0
                for word in connections_words:
                    if word.lemma.lower() == 'and':
                        connectives.append('and')
                        segments.append([])
                        seg_idx += 1
                        continue
                    if word.lemma.lower() == 'or':
                        connectives.append('or')
                        segments.append([])
                        seg_idx += 1
                        continue
                    segments[seg_idx].append(word)
                # NOTE: We assume that no "segment" list is empty--based on our grammar
                # Scan from left to right in 'connections_clause'
                last_conn_type = None
                region_name = None
                connection_nodes = []
                for segment in segments:
                    # NOTE: The order of these loops (which shouldn't be loops) currently matters.
                    # NOTE: We assume no region has these terms in their name/synonyms.
                    # TODO: Clean this up.
                    for n in finditer(
                            Lemma('connection') | Lemma('process')
                            | Lemma('arborization') | Lemma('arborizations')
                            | Lemma('arborize') | Lemma('innervate')
                            | Lemma('innervation'), segment):
                        last_conn_type = 'arbors'
                        break
                    for n in finditer(
                            Lemma('dendrite') | Lemma('input')
                            | Lemma('dendritic'), segment):
                        last_conn_type = 'dendriteArbors'
                        break
                    for n in finditer(
                            Lemma('axon') | Lemma('axonal') | Lemma('axonic')
                            | Lemma('output'), segment):
                        last_conn_type = 'axonArbors'
                        break

                    for n in finditer(Predicate(lowercase_is_in(arbregions)),
                                      segment):
                        r, s = n.span()
                        region_name = ' '.join(
                            [comp.token for comp in segment[r:s]]).lower()
                        if region_name not in arbregions:
                            log.error('Unknown region name: ' + region_name)
                            # TODO: Handle gracefully.
                        else:
                            region_name = arbregions[region_name]
                            # NOTE: We assume there's exactly one region per segment
                    # NOTE: We assume last_conn_type was set at least initially--based on our grammar
                    conn_region = HasType(last_conn_type)
                    conn_region += HasRegion(region_name)

                    connection_nodes.append(conn_region)

                # NOTE: We assume there is at least one element in connection_nodes
                # NOTE: We assume the number of connectives is one less than len(connection_nodes)
                if len(connection_nodes) == 1:
                    has_modifiers.append(connection_nodes[0])
                else:
                    connective = None
                    if connectives.pop(0) == 'and':
                        connective = IsAndOp() + HasPart(connection_nodes.pop(0)) \
                                     + HasPart(connection_nodes.pop(0))
                    else:
                        connective = IsOrOp() + HasPart(connection_nodes.pop(0)) \
                                     + HasPart(connection_nodes.pop(0))

                    while len(connectives) > 0:
                        if connectives.pop(0) == 'and':
                            connective = IsAndOp() + HasPart(connective) \
                                         + HasPart(connection_nodes.pop(0))
                        else:
                            connective = IsOrOp() + HasPart(connective) \
                                         + HasPart(connection_nodes.pop(0))
                    has_modifiers.append(connective)
            if 'is_connecting' in m:
                p, q = m['is_connecting']
                region_pair = []
                for n in finditer(Predicate(lowercase_is_in(arbregions)),
                                  matchwords[p:q]):
                    r, s = n.span()
                    r, s = r + p, s + p  # Get the offset from matchwords
                    region_name = ' '.join(
                        [comp.token for comp in matchwords[r:s]]).lower()
                    if region_name not in arbregions:
                        log.error('Unknown region name: ' + region_name)
                        # TODO: Handle gracefully
                    else:
                        region_pair.append(arbregions[region_name])
                # Check that there were exactly two regions. NOTE: This could be enforced by the grammar.
                if len(region_pair) == 2:
                    # NOTE: We assume the first region we parse is the "from" region.
                    if 'and' in [x.lemma for x in matchwords[p:q]]:
                        conn1 = IsConnection() + FromRegion(
                            region_pair[0]) + ToRegion(region_pair[1])
                        conn2 = IsConnection() + FromRegion(
                            region_pair[1]) + ToRegion(region_pair[0])
                        connecting_node = IsOrOp() + HasPart(conn1) + HasPart(
                            conn2)
                    else:
                        # NOTE: We assume 'to' is present
                        connecting_node = IsConnection() + FromRegion(
                            region_pair[0]) + ToRegion(region_pair[1])
                    # NOTE: At least for now, we'll put connections in with the 'has' relations
                    #       but NOTE that this only works if codegen_optimization to true.
                    # neuron += Connecting( connecting_node )
                    has_modifiers.append(connecting_node)
            # Now create a single "has node" for this subquery's neuron.
            if len(has_modifiers) > 1:
                # NOTE: We assume all 'has' objects are "conjuncted together".
                has_node = IsAndOp()
                for mod in has_modifiers:
                    has_node += HasPart(mod)
                neuron += Has(has_node)
            elif len(has_modifiers) == 1:
                neuron += Has(has_modifiers[0])

            return neuron, owned_region
Example #29
                                 "... sort of.")
parser.add_argument("filename", action="store")
cfg = parser.parse_args()
text = open(cfg.filename).read()

from refo import finditer, Predicate, Literal, Any, Group, Star


def notin(xs):
    return lambda x: x not in xs


name = Predicate(notin("/")) + Star(Predicate(notin(" >")))
name = Group(name, "name")
inside = name + Star(Any(), greedy=False)
opentag = Literal("<") + inside + Literal(">")
opentag = Group(opentag, "open")
closetag = Literal("<") + Literal("/") + inside + Literal(">")
closetag = Group(closetag, "close")
regex = closetag | opentag

depth = 0
for m in finditer(regex, text):
    if "open" in m:
        i, j = m["name"]
        print("  " * depth + text[i:j])
        depth += 1
    else:
        assert "close" in m
        depth -= 1
Example #30
def interpret_NeuronsQuery_MoreSpecific(self, match):
    # NOTE: If a subquery has a prepositional phrase attached (e.g. "in [regions]"),
    #       then we should see if the preceding subqueries lack a prepositional phrase.
    #       By default, attach the prep. phrase to the preceding subqueries as well.
    #       But we'd prefer to alert the user and have them check this.
    # subquery_list is a list of tuples, where the first element is the Expression tree (SAST)
    # and the second element contains the sub-tree corresponding to any owned_by region(s)
    #print "interpret_NeuronsQuery_MoreSpecific", match._words, match.words, match._particles
    global syn_num
    syn_num = None
    subquery_list = []
    for mtch in finditer(self.subquery, match.words):
        i, j = mtch.span()

        #for x in mtch.state:
        #    print x, mtch.state[x]
        def get_subquery(m, matchwords):
            neuron = IsNeuron() + HasClass('Neuron')
            owned_region = None
            global syn_num
            #print matchwords
            #print m.state
            if 'synapse_num_clause' in m:
                p, q = m['synapse_num_clause']
                conn_quant_words = matchwords[p:q]
                # TODO: Perform a search instead of finditer
                for n in finditer(Pos("CD"), conn_quant_words):
                    r, s = n.span()
                    conn_num = ' '.join(
                        [c.token for c in conn_quant_words[r:s]])
                    moreorless = False  # See above...
                    for o in finditer(Lemmas("more than"), conn_quant_words):
                        syn_num = HasMoreThan(conn_num)
                        moreorless = True
                    for o in finditer(Lemmas("less than"), conn_quant_words):
                        syn_num = HasLessThan(conn_num)
                        moreorless = True
                    for o in finditer(Lemma("atleast"), conn_quant_words):
                        syn_num = HasAtLeast(conn_num)
                        moreorless = True
                    for o in finditer(Lemma("atmost"), conn_quant_words):
                        syn_num = HasAtMost(conn_num)
                        moreorless = True
                    for o in finditer(Lemmas("at least"), conn_quant_words):
                        syn_num = HasAtLeast(conn_num)
                        moreorless = True
                    for o in finditer(Lemmas("at most"), conn_quant_words):
                        syn_num = HasAtMost(conn_num)
                        moreorless = True

                    if not moreorless:
                        syn_num = HasEqualTo(conn_num)
                #print "syn_num", syn_num
            # The (sub-)subquery which this subquery is "(pre-/post-)synaptic to".
            synaptic_to = None
            if 'synaptic_phrase' in m:
                p, q = m['synaptic_phrase']
                synaptic_phrase = matchwords[p:q]

                # TODO: Clean this up.
                spl = [w.lemma.lower() for w in synaptic_phrase]
                to_idx = spl.index('to')
                syn_type = set()
                or_syns = False
                if 'presynaptic' in spl[:to_idx]:
                    syn_type.add('presynaptic')
                if 'postsynaptic' in spl[:to_idx]:
                    syn_type.add('postsynaptic')
                if 'or' in spl[:to_idx]:
                    or_syns = True

                # NOTE: We currently only support one subquery here, anyway.
                for n in finditer(self.subquery, synaptic_phrase[to_idx + 1:]):
                    r, s = n.span()
                    synaptic_to, _ = get_subquery(n,
                                                  synaptic_phrase[to_idx + 1:])
                if 'presynaptic' in syn_type:
                    neuron += PresynapticTo(synaptic_to)
                elif 'postsynaptic' in syn_type:
                    neuron += PostsynapticTo(synaptic_to)
                if syn_num: neuron += syn_num
                # This is basically just a trick: rebind m to the match over the words
                # preceding the synaptic phrase, so the checks below update the existing
                # ("parent") subquery rather than the synaptic sub-subquery.
                # TODO: Clean this up.
                for m in finditer(self.subquery, matchwords[:p]):
                    break

            if 'region_list' in m:
                p, q = m['region_list']
                owned_region = get_region_owner(
                    matchwords[p:q], Predicate(lowercase_is_in(regions)))
                neuron = neuron + OwnedBy(owned_region)

            # We identify transmitters and neuron types with the "has" relation (e.g. in the SAST).
            # To support conjunctions/disjunctions of these modifiers while keeping the SAST
            # "simple" (at most one "has" relation per node), we collect them here and build
            # the "has" relations later.
            has_modifiers = []
            if 'neuron_modifiers' in m:
                p, q = m['neuron_modifiers']
                modifiers_words = [x for x in matchwords[p:q] if x.pos != ',']

                has_modifiers.append(
                    build_mod_tree(modifiers_words, synaptic_to))
            if 'transmitters' in m:
                p, q = m['transmitters']
                modifiers_words = [x for x in matchwords[p:q] if x.pos != ',']

                has_modifiers.append(
                    build_mod_tree(modifiers_words, synaptic_to))
            if 'neurons' in m:
                p, q = m['neurons']
                # NOTE: We assume that this can only be "interneuron(s)" or "neuron(s)"
                if 'interneuron' in ''.join([x.lemma
                                             for x in matchwords[p:q]]):
                    has_modifiers.append(IsAttribute() + HasKey('locality') +
                                         HasValue('True'))
            else:
                # NOTE: For now, we assume that a neuron 'name/type' (and not "neuron") is present
                # for n in finditer( Pos("CD"), conn_quant_words ):
                pass
            if 'expressing_marker' in m:
                p, q = m['expressing_marker']
                expressing_lemmas = [x.lemma for x in matchwords[p:q]]

                # This is just a temporary solution--before support for genetic markers is added.
                marker = IsGeneticMarker() + HasName(
                    ' '.join(expressing_lemmas))
                neuron = neuron + HasGeneticMarker(marker)
                # TODO: Include this as a 'has' relation (as above)?
            if 'conn_quant' in m:
                # NOTE: This is currently unused by the code generator
                p, q = m['conn_quant']
                conn_quant_words = matchwords[p:q]

                quantdir = IsNumConnections()
                # TODO: Perform a search instead of finditer
                for n in finditer(Pos("CD"), conn_quant_words):
                    r, s = n.span()
                    conn_num = ' '.join(
                        [c.token for c in conn_quant_words[r:s]])
                    moreorless = False  # See above...
                    for o in finditer(Lemmas("more than"), conn_quant_words):
                        quantdir = quantdir + HasMoreThan(conn_num)
                        moreorless = True
                    for o in finditer(Lemmas("less than"), conn_quant_words):
                        quantdir = quantdir + HasLessThan(conn_num)
                        moreorless = True
                    if not moreorless:
                        quantdir = quantdir + HasEqualTo(conn_num)
                for n in finditer(
                        Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS"),
                        conn_quant_words):
                    r, s = n.span()
                    conn_target = ' '.join(
                        [c.token for c in conn_quant_words[r:s]])
                    # TODO: Make conn_target.lower() ?
                    quantdir = quantdir + HasConnectionsTarget(conn_target)
                neuron = neuron + HasConnections(quantdir)
            if 'connections_clause' in m:
                p, q = m['connections_clause']
                connections_words = [
                    x for x in matchwords[p:q] if x.pos != ','
                ]
                connectives = []
                segments = [[]]
                seg_idx = 0
                for word in connections_words:
                    if word.lemma.lower() == 'and':
                        connectives.append('and')
                        segments.append([])
                        seg_idx += 1
                        continue
                    if word.lemma.lower() == 'or':
                        connectives.append('or')
                        segments.append([])
                        seg_idx += 1
                        continue
                    segments[seg_idx].append(word)
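                # e.g. (hypothetical) "axons in LO and dendrites in ME" splits into
                # segments = [[axons, in, LO], [dendrites, in, ME]] and connectives = ['and'].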
                # NOTE: We assume that no "segment" list is empty--based on our grammar
                # Scan from left to right in 'connections_clause'
                last_conn_type = None
                region_name = None
                connection_nodes = []
                for segment in segments:
                    # NOTE: The order of these loops (which shouldn't be loops) currently matters.
                    # NOTE: We assume no region has these terms in their name/synonyms.
                    # TODO: Clean this up.
                    for n in finditer(
                            Lemma('connection') | Lemma('process')
                            | Lemma('arborization') | Lemma('arborizations')
                            | Lemma('arborize') | Lemma('innervate')
                            | Lemma('innervation'), segment):
                        last_conn_type = 'arbors'
                        break
                    for n in finditer(
                            Lemma('dendrite') | Lemma('input')
                            | Lemma('dendritic'), segment):
                        last_conn_type = 'dendriteArbors'
                        break
                    for n in finditer(
                            Lemma('axon') | Lemma('axonal') | Lemma('axonic')
                            | Lemma('output'), segment):
                        last_conn_type = 'axonArbors'
                        break

                    for n in finditer(Predicate(lowercase_is_in(arbregions)),
                                      segment):
                        r, s = n.span()
                        region_name = ' '.join(
                            [comp.token for comp in segment[r:s]]).lower()
                        if region_name not in arbregions:
                            log.error('Unknown region name: ' + region_name)
                            # TODO: Handle gracefully.
                        else:
                            region_name = arbregions[region_name]
                            # NOTE: We assume there's exactly one region per segment
                    # NOTE: We assume last_conn_type was set at least initially--based on our grammar
                    conn_region = HasType(last_conn_type)
                    conn_region += HasRegion(region_name)

                    connection_nodes.append(conn_region)

                # NOTE: We assume there is at least one element in connection_nodes
                # NOTE: We assume the number of connectives is one less than len(connection_nodes)
                if len(connection_nodes) == 1:
                    has_modifiers.append(connection_nodes[0])
                else:
                    connective = None
                    if connectives.pop(0) == 'and':
                        connective = IsAndOp() + HasPart(connection_nodes.pop(0)) \
                                     + HasPart(connection_nodes.pop(0))
                    else:
                        connective = IsOrOp() + HasPart(connection_nodes.pop(0)) \
                                     + HasPart(connection_nodes.pop(0))

                    while len(connectives) > 0:
                        if connectives.pop(0) == 'and':
                            connective = IsAndOp() + HasPart(connective) \
                                         + HasPart(connection_nodes.pop(0))
                        else:
                            connective = IsOrOp() + HasPart(connective) \
                                         + HasPart(connection_nodes.pop(0))
                    has_modifiers.append(connective)
            if 'is_connecting' in m:
                p, q = m['is_connecting']
                region_pair = []
                for n in finditer(Predicate(lowercase_is_in(arbregions)),
                                  matchwords[p:q]):
                    r, s = n.span()
                    r, s = r + p, s + p  # Get the offset from matchwords
                    region_name = ' '.join(
                        [comp.token for comp in matchwords[r:s]]).lower()
                    if region_name not in arbregions:
                        log.error('Unknown region name: ' + region_name)
                        # TODO: Handle gracefully
                    else:
                        region_pair.append(arbregions[region_name])
                # Check that there were exactly two regions. NOTE: This could be enforced by the grammar.
                if len(region_pair) == 2:
                    # NOTE: We assume the first region we parse is the "from" region.
                    if 'and' in [x.lemma for x in matchwords[p:q]]:
                        conn1 = IsConnection() + FromRegion(
                            region_pair[0]) + ToRegion(region_pair[1])
                        conn2 = IsConnection() + FromRegion(
                            region_pair[1]) + ToRegion(region_pair[0])
                        connecting_node = IsOrOp() + HasPart(conn1) + HasPart(
                            conn2)
                    else:
                        # NOTE: We assume 'to' is present
                        connecting_node = IsConnection() + FromRegion(
                            region_pair[0]) + ToRegion(region_pair[1])
                    # NOTE: At least for now, we'll put connections in with the 'has' relations
                    #       but NOTE that this only works if codegen_optimization is set to true.
                    # neuron += Connecting( connecting_node )
                    has_modifiers.append(connecting_node)
            # Now create a single "has node" for this subquery's neuron.
            if len(has_modifiers) > 1:
                # NOTE: We assume all 'has' objects are "conjuncted together".
                has_node = IsAndOp()
                for mod in has_modifiers:
                    has_node += HasPart(mod)
                neuron += Has(has_node)
            elif len(has_modifiers) == 1:
                neuron += Has(has_modifiers[0])

            return neuron, owned_region

        subquery_list.append(get_subquery(mtch, match.words))

    # We could attach the prep. phrases (e.g. "in [regions]") to previous subqueries
    # only if they don't already have their own prep. phrase.
    """
    subquery_list = subquery_list[::-1]
    prev_ownedby = subquery_list[0][1]
    for i, (subq, ownedby) in enumerate( subquery_list ):
        if prev_ownedby is not None:
            if ownedby is None:
                # TODO: Check that Python is okay with these 'is's and 'not's.
                subquery_list[i][0] += OwnedBy( prev_ownedby )
        else:
            prev_ownedby = ownedby
    """

    if len(subquery_list) == 1:
        final_query = subquery_list[0][0]
    else:
        # NOTE: We currently assume set union across subqueries
        final_query = IsOrOp()
        # NOTE: If prep.-phrase attaching is enabled, the ownedby data should be considered
        #       stale at this point; the queries themselves would already have been updated
        #       with the "owned_by" relation info.
        for subq, ownedby in subquery_list:
            final_query += HasPart(subq)

    formatting = None
    # NOTE: We parse queries with an opener for each subquery, but currently only use the last
    if getattr(match, 'opener', None):
        form_lems = match.opener.lemmas
        if 'add' in form_lems:
            final_query += HasVerb('add')
        elif 'remove' in form_lems:
            final_query += HasVerb('remove')
        elif 'keep' in form_lems or 'retain' in form_lems:
            final_query += HasVerb('keep')
        elif 'list' in form_lems:
            formatting = 'information'
        elif 'graph' in form_lems:
            formatting = 'network'
        elif 'unpin' in form_lems:
            final_query += HasVerb('unpin')
        elif 'pin' in form_lems:
            final_query += HasVerb('pin')
        elif 'uncolor' in form_lems:
            final_query += HasVerb('uncolor')
        elif 'color' in form_lems:
            final_query += HasVerb('color')
            # NOTE: We only check for colors if 'color' is the verb
            if getattr(match, 'color', None):
                hue = match.color.lemmas
                if hue in colors_values:
                    hue = colors_values[hue]
                else:
                    # It's hex for a color
                    if hue.startswith('#'):
                        hue = hue[1:]
                    # NOTE: We assume the right-most digit of the given hex is the least
                    #       significant, so we left-pad with zeros to six digits.
                    hue = '0' * (6 - len(hue)) + hue
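                    # e.g. a (hypothetical) input of '#ff0' becomes 'ff0' and is then left-padded to '000ff0'.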
                final_query += HasColor(hue)
        # Not exactly natural language...
        elif 'unanimate' in form_lems or 'unblink' in form_lems:
            final_query += HasVerb('unblink')
        elif 'animate' in form_lems or 'blink' in form_lems:
            final_query += HasVerb('blink')
        elif 'unhide' in form_lems:
            final_query += HasVerb('unhide')
        elif 'hide' in form_lems:
            final_query += HasVerb('hide')
        elif 'reveal' in form_lems:
            final_query += HasVerb('reveal')
    # NOTE: "format" group overrides any "opener" group--for formatting
    #       e.g. "List neurons in Lamina as morphology" will use morphology formatting.
    # TODO: What about 'show gabaergic neurons as? [color]' or 'as? [blinking]' ?
    if getattr(match, 'formatting', None):
        form_lems = match.formatting.lemmas
        if 'list' in form_lems or 'information' in form_lems:
            formatting = 'information'
        elif 'network' in form_lems:
            formatting = 'network'
        elif 'morphology' in form_lems:
            formatting = 'morphology'

    if formatting:
        final_query += HasFormat(formatting)

    return final_query, "enum"
Example #31
0
def main():
    get_total = count_total_corpus()
    count = 0
    f_name = str(count+1)
    
    uni_collection = []
    bi_collection = []
    tri_collection = []
    four_collection = []
    
    while (count < get_total):
        n_files = str(count+1)
        get_doc = open('traindata/doc'+n_files+'.txt', 'r')
        raw_doc = get_doc.read()

        ##Extract title##
        title = get_title(raw_doc)
        ##Extract First&Last Sentence##
        fir_sen = get_first_sen(raw_doc)
        last_sen = get_last_sen(raw_doc)
        get_last = last_sen.split(',')
        get_length = len(get_last)
        #### KEYWORD SECTION ####
        x=0

        key_unigram = ''
        key_bigram = ''
        key_trigram = ''
        key_fourgram = ''
        key_unknown = ''
        
        while (x<get_length):
            get_len = len(get_last[x].split())
            if (get_len == 1):
                key_unigram += get_last[x]+','
            elif (get_len == 2):
                key_bigram += get_last[x]+','
            elif (get_len == 3):
                key_trigram += get_last[x]+','
            elif (get_len == 4):
                key_fourgram += get_last[x]+','
            else:
                key_unknown += get_last[x]+','
            x += 1
            
        ### GET IN LIST ###
        key_unis = key_unigram.split(',')
        key_bis = key_bigram.split(',')
        key_tris = key_trigram.split(',')
        key_fours = key_fourgram.split(',')
        key_uns = key_unknown.split(',')
        ##print key_unis, key_bis, key_tris, key_fours, key_uns
            
        get_content = raw_doc.splitlines()[1:] #List form
        after_last_sen = get_content[:-1]
        content_str = ''.join(after_last_sen) #content in String format
        
        prettify_txt = re.sub(r'[^\w.]',' ', content_str)
        ##mod_txt = remov_stopword(prettify_txt)
        token_txt = nltk.sent_tokenize(prettify_txt)
        ##Number of sentences: len(token_txt)##
        token_word = [nltk.word_tokenize(sent) for sent in token_txt]
        pos_tag = [nltk.pos_tag(sent) for sent in token_word]

        
        ##Chunking and collecting NPs##
        get_nouns = [[Word(*x) for x in sent] for sent in pos_tag]
        ##NNP Rules##
        rule_0 = W(pos = "NNS")| W(pos = "NNS")| W(pos = "NN") | W(pos = "NNP")
        rule_05 = W(pos = "NNP") + W(pos = "NNS")
        rule_1 = W(pos = "WP$") + W(pos = "NNS")
        rule_2 = W(pos = "CD") + W(pos = "NNS")
        rule_3 = W(pos = "NN") + W(pos = "NN")
        rule_4 = W(pos = "NN") + W(pos = "NNS")
        rule_5 = W(pos = "NNP") + W(pos = "CD")
        rule_6 = W(pos = "NNP") + W(pos = "NNP")
        rule_7 = W(pos = "NNP") + W(pos = "NNPS")
        rule_8 = W(pos = "NNP") + W(pos = "NN")
        rule_9 = W(pos = "NNP") + W(pos = "VBZ")
        rule_10 = W(pos = "DT") + W(pos = "NNS")
        rule_11 = W(pos = "DT") + W(pos = "NN")
        rule_12 = W(pos = "DT") + W(pos = "NNP")
        rule_13 = W(pos = "JJ") + W(pos = "NN")
        rule_14 = W(pos = "JJ") + W(pos = "NNS")
        rule_15 = W(pos = "PRP$") + W(pos = "NNS")
        rule_16 = W(pos = "PRP$") + W(pos = "NN")
        rule_02 = W(pos = "NN") + W(pos = "NN") + W(pos = "NN")
        rule_17 = W(pos = "NN") + W(pos = "NNS") + W(pos = "NN")
        rule_18 = W(pos = "NNP") + W(pos = "NNP") + W(pos = "NNP")
        rule_19 = W(pos = "JJ") + W(pos = "NN") + W(pos = "NNS")
        rule_20 = W(pos = "PRP$") + W(pos = "NN") + W(pos = "NN")
        rule_21 = W(pos = "DT") + W(pos = "JJ") + W(pos = "NN")
        rule_22 = W(pos = "DT") + W(pos = "CD") + W(pos = "NNS")
        rule_23 = W(pos = "DT") + W(pos = "VBG") + W(pos = "NN")
        rule_24 = W(pos = "DT") + W(pos = "NN") + W(pos = "NN")
        rule_25 = W(pos = "NNP") + W(pos = "NNP") + W(pos = "VBZ")
        rule_26 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NN")
        rule_27 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NNP")
        rule_28 = W(pos = "DT") + W(pos = "JJ") + W(pos = "NN")
        rule_29 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NNP") + W(pos = "NNP")
        rule_30 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NN") + W(pos = "NN") 

        NP_bi_gram_set = (rule_05)|(rule_1)|(rule_2)|(rule_3)|(rule_4)|(rule_5)|(rule_6)|(rule_7)|(rule_8)|(rule_9)|(rule_10)|(rule_11)|(rule_12)|(rule_13)|(rule_14)|(rule_15)|(rule_16)
        NP_tri_gram_set = (rule_02)|(rule_17)|(rule_18)|(rule_19)|(rule_20)|(rule_21)|(rule_22)|(rule_23)|(rule_24)|(rule_25)|(rule_26)|(rule_27)|(rule_28)
        NP_quard_gram_set = (rule_29)|(rule_30)
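        # In refo, '+' concatenates patterns and '|' builds alternatives, so each rule above
        # matches a fixed sequence of POS tags (e.g. rule_13 matches an adjective followed by
        # a singular noun) and each *_gram_set matches any one of its rules.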

        #Rule set aliases for each n-gram length
        get_uni_gram = (rule_0)
        get_bi_gram = NP_bi_gram_set
        get_tri_gram = NP_tri_gram_set
        get_quard_gram = NP_quard_gram_set

        bag_of_NP = []
        bag_of_biNP = []
        bag_of_triNP = []
        bag_of_fourNP = []

        total__tfidf = 0
        #######################
        for k, s in enumerate(get_nouns):
            for match in finditer(get_uni_gram, s):
                x, y = match.span() #the match spans x to y inside the sentence s
                ##print pos_tag[k][x:y]
                bag_of_NP += pos_tag[k][x:y]
        for k, s in enumerate(get_nouns):
            for match in finditer(get_bi_gram, s):
                x, y = match.span()
                ##print pos_tag[k][x:y]
                bag_of_biNP += pos_tag[k][x:y]
        for k, s in enumerate(get_nouns):
            for match in finditer(get_tri_gram, s):
                x, y = match.span()
                ##print pos_tag[k][x:y]
                bag_of_triNP += pos_tag[k][x:y]
        for k, s in enumerate(get_nouns):
            for match in finditer(get_quard_gram, s):
                x,y = match.span()
                ##print pos_tag[k][x:y]
                bag_of_fourNP += pos_tag[k][x:y]

        ##### GETTING EACH WORD TFIDF #####
        uni_tfidf_values = ''
        str_uni_grams = ''
        total_docs = count_total_corpus()
        fdist = nltk.FreqDist(bag_of_NP)

        unzip_unigram = zip(*bag_of_NP)
        str_unigrams = list(unzip_unigram[0])
        
        ##UNI MAXIMUM TermScore##
        scores = []
        for word in fdist:
            score = fdist[word]
            scores.append(score)
        max_uni = max(scores)
        ######################
        
        for word in fdist:
            fq_word = fdist[word]
            get_tf = term_frequency(fq_word, max_uni)

            to_string = ':'.join(word)
            get_this_string = convert_to_string(to_string)

            num_of_doc_word = count_nterm_doc(get_this_string)
            idf_score = inverse_df(total_docs, num_of_doc_word)

            tf_idf_scr = get_tf * idf_score
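            # (assumption) term_frequency() is a max-normalised term frequency and inverse_df()
            # a log inverse document frequency, so tf_idf_scr is a standard TF-IDF score
            # (e.g. a term seen 3 times with max count 6, in 1 of 10 docs: 0.5 * log(10)).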
            total__tfidf += tf_idf_scr

            uni_tfidf_scr = repr(tf_idf_scr)+' '
            uni_tfidf_values += uni_tfidf_scr
            str_uni_grams += get_this_string+','

        get_uni_float = [float(x) for x in uni_tfidf_values.split()]
        get_uni_list = str_uni_grams.split(',')
        unigram_dict = dict(zip(get_uni_list, get_uni_float))
        
        ##### GET TFIDF FOR UNIGRAMS & AVERAGE TFIDF VALUES #####
        uni_avg_tfidf = (sum(map(float, get_uni_float)))/(len(get_uni_float))
        get_zip_str = [''.join(item) for item in str_unigrams]
        unigrams_list = zip(get_zip_str, get_uni_float)

        ##### TFIDF FEATURE MATRIX #####
        uni_feat_tfidf = []
        for x in unigrams_list:
            if float(x[1]) > uni_avg_tfidf:
                uni_feat_tfidf.append(1)
            else:
                uni_feat_tfidf.append(0)
        zip_tfidf_feat = zip(get_zip_str, get_uni_float, uni_feat_tfidf)
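        # Each unigram now carries (term, tfidf score, binary above-average-tfidf flag); the
        # remaining features (first sentence, title, keyword) are appended the same way below.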
        ###############################
        ##### First Sentence Feat #####
        uni_fir_sen = []
        for x in unigrams_list:
            file_name = 'traindata/doc'+f_name+'.txt'
            get_res = chk_frs_sen(x[0], file_name)
            if get_res == 1:
                uni_fir_sen.append(1)
            else:
                uni_fir_sen.append(0)
        zip_fir_sen_feat = zip(get_zip_str, get_uni_float, uni_feat_tfidf, uni_fir_sen)
        ############################
        ##### Involve in Title #####
        uni_title_feat = []
        for x in unigrams_list:
            get_res = involve_in_title(x[0], title)
            if get_res == 1:
                uni_title_feat.append(1)
            else:
                uni_title_feat.append(0)
        zip_uni_feats = zip(get_zip_str, get_uni_float, uni_feat_tfidf, uni_fir_sen, uni_title_feat)
        ############################
        ##### KEYWORD OR NOT #####
        key_uni_matx = []
        for x in unigrams_list:
            get_res = chk_keyword(x[0],key_unis)
            if get_res == 1:
                key_uni_matx.append(1)
            else:
                key_uni_matx.append(0)
        zip_uni_all_feat = zip(get_zip_str, get_uni_float, uni_feat_tfidf, uni_fir_sen, uni_title_feat, key_uni_matx)
        #########################################################
        
        ##### GETTING BIGRAMS #####
        ##Term Frequency for bigrams##
        total__tfidf = 0
        bi_tfidf_values = ''
        str_bi_grams = ''
        
        unzip_bigram = zip(*bag_of_biNP)
        str_bigrams = list(unzip_bigram[0])
        get_bigrams = zip(str_bigrams, str_bigrams[1:])[::2]
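        # Pairs consecutive tokens two at a time (Python 2 zip returns a list, so it can be
        # sliced), e.g. ['a', 'b', 'c', 'd'] -> [('a', 'b'), ('c', 'd')].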
        bi_dist = nltk.FreqDist(bag_of_biNP)

        ##BI MAXIMUM TermScore##
        bi_scores = []
        for word in bi_dist:
            score = bi_dist[word]
            bi_scores.append(score)
        max_bi = max(bi_scores)
        ######################
    
        for word in bi_dist:
            tq_word = bi_dist[word]
            get_tf = term_frequency(tq_word, max_bi)
        
            ### FEATURES ###
            ##Tuple to String##
            to_string = ':'.join(word)
            get_this_string = convert_to_string(to_string)
        
            ##DF Score
            num_of_doc_word = count_nterm_doc(get_this_string)
        
            ##TF.IDF Score
            idf_score = inverse_df(total_docs, num_of_doc_word)
            tf_idf_scr = get_tf*idf_score
            total__tfidf += tf_idf_scr

            ##GET EACH BIGRAMS TFIDF
            get_tfidf_scr = repr(tf_idf_scr)+' '
            bi_tfidf_values += get_tfidf_scr
            str_bi_grams += get_this_string+','
            
        ##BUILD DICT FOR EACH TERMS
        get_float = [float(x) for x in bi_tfidf_values.split()]
        get_bi_list = str_bi_grams.split(',')
        bigram_dict = dict(zip(get_bi_list, get_float))
        ###########################
    
        ##GET TFIDF FOR BIGRAMS##
        get_bi_floats = get_val_bipairs(bigram_dict, get_bigrams)
        get_zip = dict(zip(get_bigrams, get_bi_floats))
        ############
        real_avg_tfidf = (sum(map(float,get_bi_floats)))/(len(get_bi_floats))
        ###########################
        get_zip_str = [' '.join(item) for item in get_bigrams]
        ###Bigrams string with TFIDF###
        bigrams_list =  zip(get_zip_str, get_bi_floats)

        ##### TFIDF FEATURE MATRIX #####
        feat_tfidf_matx = []
        for x in bigrams_list:
            if float(x[1]) > real_avg_tfidf:
                feat_tfidf_matx.append(1)
            else:
                feat_tfidf_matx.append(0)
            
        tfidf_feat = zip(get_zip_str, get_bi_floats, feat_tfidf_matx)
        #################################
        #### FIRST SENTENCE FEATURE ####
        feat_fir_sen = []
        for x in tfidf_feat:
            file_name = 'traindata/doc'+f_name+'.txt'
            get_res = chk_frs_sen(x[0], file_name)
            if get_res == 1:
                feat_fir_sen.append(1)
            else:
                feat_fir_sen.append(0)
            
        fir_sen_feat = zip (get_zip_str, get_bi_floats, feat_tfidf_matx, feat_fir_sen)

        #### INVOLVE IN TITLE FEATURE ###
        feat_invol_tit = []
        for x in fir_sen_feat:
            get_res = involve_in_title(x[0], title)
            if get_res == 1:
                feat_invol_tit.append(1)
            else:
                feat_invol_tit.append(0)
        invol_tit_feat = zip (get_zip_str, get_bi_floats, feat_tfidf_matx, feat_fir_sen, feat_invol_tit)
        ##### KEYWORD OR NOT #####
        key_bi_matx = []
        for x in bigrams_list:
            get_res = chk_keyword(x[0],key_bis)
            if get_res == 1:
                key_bi_matx.append(1)
            else:
                key_bi_matx.append(0)
        zip_bi_all_feat = zip(get_zip_str, get_bi_floats, feat_tfidf_matx, feat_fir_sen, feat_invol_tit, key_bi_matx)
        #####################################
        ##### GETTING TRIGRAMS #####
        #Term Frequency for trigrams
        total__tfidf = 0
        tri_tfidf_values = ''
        str_tri_grams = ''
        
        unzip_trigram = zip(*bag_of_triNP)
        str_trigrams = list(unzip_trigram[0])
        get_trigrams = zip(str_trigrams, str_trigrams[1:], str_trigrams[2:])[::3]
        tri_dist = nltk.FreqDist(bag_of_triNP)

        ##TRI MAXIMUM TermScore##
        tri_scores = []
        for word in tri_dist:
            score = tri_dist[word]
            tri_scores.append(score)
        max_tri = max(tri_scores)
        ######################
    
        for word in tri_dist:
            tr_fq = tri_dist[word]
            get_tf = term_frequency(tr_fq, max_tri)
    
            ### FEATURES ###
            ##Tuple to String##
            to_string = ':'.join(word)
            get_this_string = convert_to_string(to_string)
            ##DF Score
            num_of_doc_word = count_nterm_doc(get_this_string)
            ##
            ##TF.IDF Score
            idf_score = inverse_df(total_docs, num_of_doc_word)
            tf_idf_scr = get_tf * idf_score
            total__tfidf += tf_idf_scr

            ##GET EACH TRIGRAMS TFIDF
            get_tfidf_scr = repr(tf_idf_scr)+' '
            tri_tfidf_values += get_tfidf_scr
            str_tri_grams += get_this_string+','
            
        ##BUILD DICT FOR EACH TERMS
        get_tri_float = [float(x) for x in tri_tfidf_values.split()]
        get_tri_list = str_tri_grams.split(',')
        trigram_dict = dict(zip(get_tri_list, get_tri_float))
        ###########################
    
        ##GET TFIDF FOR TRIGRAMS##
        get_tri_floats = get_val_tripairs(trigram_dict, get_trigrams)
        get_tri_zip = dict(zip(get_trigrams, get_tri_floats))
        ############
        tri_avg_tfidf = (sum(map(float,get_tri_floats)))/(len(get_tri_floats))
        ###########################
        get_ziptri_str = [' '.join(item) for item in get_trigrams]
        ###Trigrams string with TFIDF###
        trigrams_list =  zip(get_ziptri_str, get_tri_floats)
        ###########################
        ##### TFIDF FEATURE MATRIX #####
        tri_tfidf_matx = []
        for x in trigrams_list:
            if float(x[1]) > tri_avg_tfidf:
                tri_tfidf_matx.append(1)
            else:
                tri_tfidf_matx.append(0)
            
        tri_tfidf_feat = zip(get_ziptri_str, get_tri_floats, tri_tfidf_matx)
        ################################
        #### FIRST SENTENCE FEATURE ####
        tri_fir_sen = []
        for x in tri_tfidf_feat:
            file_name = 'traindata/doc'+f_name+'.txt'
            get_res = chk_frs_sen(x[0], file_name)
            if get_res == 1:
                tri_fir_sen.append(1)
            else:
                tri_fir_sen.append(0)
            
        tri_sen_feat = zip (get_ziptri_str, get_tri_floats, tri_tfidf_matx, tri_fir_sen)
        #################################
        #### INVOLVE IN TITLE FEATURE ###
        tri_invol_tit = []
        for x in tri_sen_feat:
            get_res = involve_in_title(x[0], title)
            if get_res == 1:
                tri_invol_tit.append(1)
            else:
                tri_invol_tit.append(0)
        tri_tit_feat = zip (get_ziptri_str, get_tri_floats, tri_tfidf_matx, tri_fir_sen, tri_invol_tit)
        ##################################################
        ##### KEYWORD OR NOT #####
        key_tri_matx = []
        for x in trigrams_list:
            get_res = chk_keyword(x[0],key_tris)
            if get_res == 1:
                key_tri_matx.append(1)
            else:
                key_tri_matx.append(0)
        zip_tri_all_feat = zip(get_ziptri_str, get_tri_floats, tri_tfidf_matx, tri_fir_sen, tri_invol_tit, key_tri_matx)
        #########################################################
        ##### GETTING 4-GRAMS #####
        #Term Frequency for 4-grams
        if (len(bag_of_fourNP)>0):
            
            total__tfidf = 0
            four_tfidf_values = ''
            str_four_grams = ''
            ###############
            unzip_fourgram = zip(*bag_of_fourNP)
            str_fourgrams = list(unzip_fourgram[0])
            get_fourgrams = zip(str_fourgrams, str_fourgrams[1:], str_fourgrams[2:], str_fourgrams[3:])[::4]
            ############################
            f_dist = nltk.FreqDist(bag_of_fourNP)
            ##4 MAXIMUM TermScore##
            four_scores = []
            for word in f_dist:
                score = f_dist[word]
                four_scores.append(score)
            max_four = max(four_scores)
            ######################
            
            for word in f_dist:
                fr_fq = f_dist[word]
                get_tf = term_frequency(fr_fq, max_four)

                ### FEATURES ###
                ##Tuple to String##
                to_string = ':'.join(word)
                get_this_string = convert_to_string(to_string)
                ##DF Score
                num_of_doc_word = count_nterm_doc(get_this_string)
                ##TF.IDF Score
                idf_score = inverse_df(total_docs, num_of_doc_word)
                tf_idf_scr = get_tf * idf_score
                total__tfidf += tf_idf_scr

                ##GET EACH FOURGRAMS TFIDF
                get_tfidf_scr = repr(tf_idf_scr)+' '
                four_tfidf_values += get_tfidf_scr
                str_four_grams += get_this_string+','

            ##BUILD DICT FOR EACH TERMS
            get_four_float = [float(x) for x in four_tfidf_values.split()]
            get_four_list = str_four_grams.split(',')
            fourgram_dict = dict(zip(get_four_list, get_four_float))
            ###########################

            ##GET TFIDF FOR 4-GRAMS##
            get_four_floats = get_val_fpairs(fourgram_dict, get_fourgrams)
            get_four_zip = dict(zip(get_fourgrams, get_four_floats))
            ############
            four_avg_tfidf = (sum(map(float,get_four_floats)))/(len(get_four_floats))
            ###########################
            get_zipfour_str = [' '.join(item) for item in get_fourgrams]
            ###Fourgrams string with TFIDF###
            fourgrams_list =  zip(get_zipfour_str, get_four_floats)
            ###########################
            ##### TFIDF FEATURE MATRIX #####
            four_tfidf_matx = []
            for x in fourgrams_list:
                if float(x[1]) > four_avg_tfidf:
                    four_tfidf_matx.append(1)
                else:
                    four_tfidf_matx.append(0)
            
            four_tfidf_feat = zip(get_zipfour_str, get_four_floats, four_tfidf_matx)
            #################################
            #### FIRST SENTENCE FEATURE ####
            four_fir_sen = []
            for x in four_tfidf_feat:
                file_name = 'traindata/doc'+f_name+'.txt'
                get_res = chk_frs_sen(x[0], file_name)
                if get_res == 1:
                    four_fir_sen.append(1)
                else:
                    four_fir_sen.append(0)
            
            four_sen_feat = zip (get_zipfour_str, get_four_floats, four_tfidf_matx, four_fir_sen)
            #################################
            #### INVOLVE IN TITLE FEATURE ###
            four_invol_tit = []
            for x in four_sen_feat:
                get_res = involve_in_title(x[0], title)
                if get_res == 1:
                    four_invol_tit.append(1)
                else:
                    four_invol_tit.append(0)
            four_tit_feat = zip (get_zipfour_str, get_four_floats, four_tfidf_matx, four_fir_sen, four_invol_tit)
            ##### KEYWORD OR NOT #####
            key_four_matx = []
            for x in fourgrams_list:
                get_res = chk_keyword(x[0],key_fours)
                if get_res == 1:
                    key_four_matx.append(1)
                else:
                    key_four_matx.append(0)
            zip_four_all_feat = zip(get_zipfour_str, get_four_floats, four_tfidf_matx, four_fir_sen, four_invol_tit, key_four_matx)
            #########################################################
        else:
            print 'Pass4-gram'
            zip_four_all_feat = []
        
        uni_collection +=  zip_uni_all_feat
        bi_collection += zip_bi_all_feat
        tri_collection += zip_tri_all_feat
        four_collection += zip_four_all_feat
        
        total_unigram = len(uni_collection) ##UNIGRAM
        total_bigram = len(bi_collection) ##BIGRAM
        total_trigram = len(tri_collection) ##TRIGRAM
        total_fourgram = len(four_collection) ##FOURGRAM
        #######################
        print "Document "+n_files+" has been processed."
        count += 1

    ############################################
    get_uni_vals = cal_bayes(uni_collection)
    get_bi_vals = cal_bayes(bi_collection)
    get_tri_vals = cal_bayes(tri_collection)
    get_four_vals = cal_bayes(four_collection)
    ##### GET TFIDF DISTRIBUTIONS #####
    print '########## TFIDF DISTRIBUTIONS FOR N-GRAMS ##########'
    print dist_tfidf(get_uni_vals)
    print dist_tfidf(get_bi_vals)
    print dist_tfidf(get_tri_vals)
    print dist_tfidf(get_four_vals)
    ###################################
    ##### GET FIRST SENTENCE DISTRIBUTIONS #####
    print '########## FIRST SEN. DISTRIBUTIONS FOR N-GRAMS ##########'
    print dist_firsen(get_uni_vals)
    print dist_firsen(get_bi_vals)
    print dist_firsen(get_tri_vals)
    print dist_firsen(get_four_vals)
    ############################################
    ##### GET TITLE DISTRIBUTIONS #####
    print '########## TITLE DISTRIBUTIONS FOR N-GRAMS ##########'
    print dist_title(get_uni_vals)
    print dist_title(get_bi_vals)
    print dist_title(get_tri_vals)
    print dist_title(get_four_vals)
    ###################################

    ##### PRODUCE TEXT #####
    print '########## STORE INTO TEXT ##########'
    matrix_txt('uni_tf.txt',dist_tfidf(get_uni_vals))
    matrix_txt('uni_fs.txt',dist_firsen(get_uni_vals))
    matrix_txt('uni_tit.txt',dist_title(get_uni_vals))

    matrix_txt('bi_tf.txt',dist_tfidf(get_bi_vals))
    matrix_txt('bi_fs.txt',dist_firsen(get_bi_vals))
    matrix_txt('bi_tit.txt',dist_title(get_bi_vals))

    matrix_txt('tri_tf.txt',dist_tfidf(get_tri_vals))
    matrix_txt('tri_fs.txt',dist_firsen(get_tri_vals))
    matrix_txt('tri_tit.txt',dist_title(get_tri_vals))

    matrix_txt('four_tf.txt',dist_tfidf(get_four_vals))
    matrix_txt('four_fs.txt',dist_firsen(get_four_vals))
    matrix_txt('four_tit.txt',dist_title(get_four_vals))