Example #1
def rifu():
    global pre_append
    global keywords
    global doc_freq
    global url
    print "starting time:",
    now = datetime.datetime.now()
    print datetime.time(now.hour, now.minute, now.second)

    url = request.args.get('url', 0, type=str)

    #webpage = urllib2.urlopen(url).read()
    #para= re.compile('<p>(.*)</p>') #collect data in p tags and store in para object
    # raw = re.findall(para , webpage)
    # rawstr = ' '.join(raw)
    # text = tokenize(rawstr)
    #SCRAPE
    print "URL:" + url
    text = scrape.scrapePage(url)
    print "\nScraping Done"
    print text

    #GET KEYWORDS

    keywords = keywords_file.get_keywords(url, text)
    print "my keywords:::"
    print keywords
    #GET CONTEXT
    nearbywordsObj = getnearbywords_intokens.getnearbywords()
    doc_freq = nearbywordsObj.get_words_from_proximity(keywords, text)
    #    import pdb;pdb.set_trace();
    print "doc freq::::" + str(doc_freq)

    #GET SEARCH RESULTS
    results = ""
    results = search.search_web(doc_freq, results, 0, 4, url)
    pre_append = results

    tab_id = request.args.get('tab_id', 0, type=int)
    # append a dummy {"tab_id": ...} entry so the trailing comma left after the
    # last real result still yields valid JSON, then close the array and object
    results += '{"tab_id":' + str(tab_id) + '}'
    results += "]}"

    print results
    return results
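
The query parameters here are read with request.args.get(..., type=...), which matches Flask's request API, and the response is built by hand as a JSON string. Below is a minimal, hypothetical sketch of how such a handler could be registered and what the final payload roughly looks like; the scraping, keyword and search modules are stubbed out, and the leading {"results":[ is an assumption, since search_web is not shown.

# Sketch only: assumes Flask; scrape, keywords_file, getnearbywords_intokens and
# search are the project's own modules and are not reproduced here.
from flask import Flask, request

app = Flask(__name__)

@app.route('/rifu')
def rifu_route():
    url = request.args.get('url', 0, type=str)         # e.g. /rifu?url=http://example.com&tab_id=3
    tab_id = request.args.get('tab_id', 0, type=int)
    # ... scrape the page, extract keywords, run the web search ...
    # the original handler returns a hand-built JSON string shaped roughly like:
    #   {"results":[{...}, {...}, {"tab_id":3}]}
    return '{"results":[{"tab_id":' + str(tab_id) + '}]}'

if __name__ == '__main__':
    app.run()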
Example #2
def main():
    obj = proper_noun()
    text = obj.scrape(sys.argv[1])
    title = alchemyObj.URLGetTitle(sys.argv[1])
    soup = BeautifulSoup(title)
    raw = soup('title')
    tokens_title_first = [str(title.text) for title in raw]
    #tokens_title = ['Three', 'Musketeers']
    print "title::",
    print tokens_title_first
    #text = original
    ### Take nouns in title
    tokens_title_first = str(tokens_title_first[0])
    print tokens_title_first
    tokens_title_temp = nltk.word_tokenize(tokens_title_first)
    tokens_title_pos = nltk.pos_tag(tokens_title_temp)
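    # nltk.pos_tag turns the token list into (token, POS-tag) pairs; "NNP" marks proper nouns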
    print "tokens_title_temp::",
    print tokens_title_temp

    tokens_title = []      ## keep only useful proper nouns from the title
    for index, t in enumerate(tokens_title_temp):  # enumerate avoids index() picking the wrong slot for repeated tokens
        print "t::" + t
        print "index::" + str(index)
        print "tag::" + tokens_title_pos[index][1]
        print "len::" + str(len(t))
        if t.isalpha() and tokens_title_pos[index][1] == "NNP" and len(t) >= 3:
            # tokens_title.remove(t)
            tokens_title.append(t)
    tokens_title.sort()
    tokens_title = list(tokens_title for tokens_title,_ in itertools.groupby(tokens_title))
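    # sorting and then grouping with itertools.groupby drops duplicate title nouns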
    print "title::",
    print tokens_title 
    list_of_NNPs = obj.get_nnp_ngrams(text,5,0)
    
    #list_of_NNPs = [['Three','Musketeers'],['Alexandre', 'Dumas']]#,['Cardinal', 'Richelieu'],['Athos'],['Although'],['Porthos'] ]
    print "list of NNPs: ",
    print list_of_NNPs
    if len(list_of_NNPs) > 3:
        list_of_NNPs = list_of_NNPs[0:3]
    doc_freq_obj = doc_freq_class.context()
    print "getting doc freq"
    max_df = []
    for n in list_of_NNPs:
        print "got n"
        max_freq = 0
        for t in tokens_title:
            print "got t"
            df = doc_freq_obj.get_together_DF(n,t)
            if df > max_freq:
                max_freq = df
            print "ngram:",
            print n
            print "title word:",
            print t
            print "df:",
            print df
        max_df.append(max_freq)
    # bubble-sort both lists in parallel so the n-grams with the highest
    # document frequency come first
    for _ in range(len(max_df)):
        for i in range(len(max_df) - 1):
            if max_df[i] < max_df[i + 1]:
                list_of_NNPs[i], list_of_NNPs[i + 1] = list_of_NNPs[i + 1], list_of_NNPs[i]
                max_df[i], max_df[i + 1] = max_df[i + 1], max_df[i]
    #i = 0
    for i in range(len(list_of_NNPs)):
        print "keyword: ",
        print list_of_NNPs[i] 
        print "df:",
        print max_df[i]
    if len(list_of_NNPs) > 3:
        list_of_NNPs = list_of_NNPs[0:3]
    #list_of_NNPs.sort()
    #list_of_NNPs_final = list(list_NNPs for list_NNPs,_ in itertools.groupby(list_of_NNPs))
    #list_of_NNPs_final.sort()
    print "\n\nfinal list:",
    print list_of_NNPs
    nearbywordsObj = getnearbywords_intokens.getnearbywords()
    nearbywordsObj.get_words_from_proximity(list_of_NNPs,text)
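
The nested loops near the end of main() are a bubble sort that orders the NNP n-grams by the best document frequency found for each. A shorter way to express the same ranking is sketched below; rank_by_df is a hypothetical helper, and it assumes max_df[i] holds the best DF for list_of_NNPs[i], as in the code above.

# Sketch: rank n-grams by their best document frequency, highest first.
def rank_by_df(list_of_NNPs, max_df):
    paired = sorted(zip(max_df, list_of_NNPs), reverse=True)   # sort by DF, descending
    ranked_ngrams = [ngram for _, ngram in paired]
    ranked_dfs = [df for df, _ in paired]
    return ranked_ngrams, ranked_dfs

# Example with made-up frequencies:
ngrams = [['Three', 'Musketeers'], ['Alexandre', 'Dumas'], ['Athos']]
dfs = [120, 450, 80]
print(rank_by_df(ngrams, dfs))
# -> ([['Alexandre', 'Dumas'], ['Three', 'Musketeers'], ['Athos']], [450, 120, 80])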