Example No. 1
def _get_question_subject(question, doc_name):
    question_subject = keywords.extract_keywords(NLP, question, 1)

    if _matches_main_doc_subject(question_subject, doc_name):
        question_subject = doc_name
    elif "complication" in question_subject or "complication" in question:
        question_subject = "complications of " + doc_name
    return question_subject
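A minimal usage sketch, assuming keywords.extract_keywords(nlp, text, n) returns a list of up to n keyword strings; the stubs below are hypothetical stand-ins for the project-local pieces:

from types import SimpleNamespace

# Hypothetical stand-ins for the project-local module, model, and predicate.
keywords = SimpleNamespace(
    extract_keywords=lambda nlp, text, n: ["complication"])  # toy result
NLP = None  # e.g. a loaded spaCy model in the real project

def _matches_main_doc_subject(subject, doc_name):
    return False  # assumed predicate; the real logic is project-specific

print(_get_question_subject("What complications can occur?", "diabetes"))
# -> "complications of diabetes" with the toy stubs above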
Example No. 2
def _enrich_qna(qnadoc):
    allwords = ""

    for qna_pair in qnadoc.qnaList:
        qna_pair.add_metadata(
            "keywords", keywords.extract_keywords(NLP, qna_pair.answer, 5))

        subject = _get_question_subject(qna_pair.questions[0],
                                        qnadoc.name.lower())
        qna_pair.add_metadata("subject", subject)

        allwords += qna_pair.questions[0]
        allwords += qna_pair.answer

    qnadoc.add_metadata("keywords",
                        keywords.extract_keywords(NLP, allwords, 10))

    return qnadoc
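The source does not show the document classes this operates on; here is a minimal sketch of stand-ins that would satisfy the calls above (attribute names mirror the usage, everything else is an assumption):

# Minimal stand-in classes inferred from the attribute accesses above;
# the real project definitions may differ.
class QnaPair:
    def __init__(self, questions, answer):
        self.questions = questions  # list of question strings
        self.answer = answer        # answer text
        self.metadata = {}

    def add_metadata(self, key, value):
        self.metadata[key] = value

class QnaDoc:
    def __init__(self, name, qnaList):
        self.name = name            # document name, e.g. "Diabetes"
        self.qnaList = qnaList      # list of QnaPair
        self.metadata = {}

    def add_metadata(self, key, value):
        self.metadata[key] = value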
Example No. 3
def retrieve_keywords():
    try:
        txtfile = glob.glob(
            os.path.join(app.config['SCRAPE_OUTPUT_FOLDER'],
                         flask.session['sid'], '*', '*.txt'))[0]
        results = extract_keywords(txtfile, dict(flask.session))
    except Exception:
        return "PDFMAGIC_ERROR: This item is not currently available"
    return flask.jsonify(results)
Example No. 4
def analyse_sentence(sentence):
    '''Takes a tweet, performs sentiment analysis on it, and returns the
    extracted keywords weighted by the sentiment score.

    TODO: Is this function necessary? HALF-DEPRECATED'''
    
    sentiment = analyse_sentiment(sentence)
    keywordtuples = extract_keywords(sentence)
    return [(keyword, sentiment * weight) for (keyword, weight) in keywordtuples]
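A hedged usage sketch with toy stand-ins for the two helpers (their real signatures are inferred from the calls above):

# Toy stand-ins so the sketch runs on its own.
def analyse_sentiment(sentence):
    return 0.8  # pretend the tweet is positive

def extract_keywords(sentence):
    return [("pizza", 1.0), ("friday", 0.5)]

print(analyse_sentence("pizza friday!"))
# -> [('pizza', 0.8), ('friday', 0.4)]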
Example No. 5
def get_results():
    text = request.form['text']
    sentences = request.form['sentences_count']
    is_summary = 'summary' in request.form
    if is_summary:
        return render_template('summary.html',
                               summary=summarize(text, int(sentences)))
    return render_template('keywords.html',
                           keywords=extract_keywords(text, int(sentences)))
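The handler expects form fields named text and sentences_count plus an optional summary checkbox; a hypothetical client call follows (the route path and port are assumptions, not from the source):

import requests

# Illustrative request only; '/results' is an assumed route.
resp = requests.post("http://localhost:5000/results",
                     data={"text": "Some long article text ...",
                           "sentences_count": 3,
                           "summary": "on"})  # omit 'summary' to get keywords
print(resp.status_code)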
Example No. 6
def analyse_sentences_var_1(sentences):
    '''Analyses every sentence and compiles the results into two
    keyword-weight collections: one for positive sentiment and one for
    negative sentiment.

    ...'''

    hatekeywords = {}
    lovekeywords = {}
    for sentence in sentences:
        sentiment = analyse_sentiment(sentence)
        for (keyword, weight) in extract_keywords(sentence):
            # route by sentiment sign; keep only positive weights in the result
            target = lovekeywords if sentiment > 0.0 else hatekeywords
            target[keyword] = target.get(keyword, 0.0) + weight * abs(sentiment)

    return (lovekeywords.items(), hatekeywords.items())
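A usage sketch with toy helpers (their behavior below is assumed); the function returns dict views of keyword to accumulated positive weight:

# Toy stand-ins so the sketch runs on its own.
def analyse_sentiment(sentence):
    return 1.0 if "good" in sentence else -1.0

def extract_keywords(sentence):
    return [(word, 1.0) for word in sentence.split()]

love, hate = analyse_sentences_var_1(["good coffee", "bad traffic"])
print(dict(love))  # {'good': 1.0, 'coffee': 1.0}
print(dict(hate))  # {'bad': 1.0, 'traffic': 1.0}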
Example No. 7
def extract_keywords_test(filename, keywords_count):
    text = get_text_from_file(filename)
    result = extract_keywords(text, keywords_count)
    print(result)
    print(len(result))
Example No. 8
def main(dom_choice, domain_list):
    if (dom_choice > len(domain_list)):
        print "Wrong choice"
        return "Wrong choice"
    domain = domain_list[dom_choice - 1]
    f = open("../datasets/Brands/" + domain.lower() + ".pickle", 'rb')
    object_file = pickle.load(f)
    c = 0
    brandslist = {}
    prodslist = {}
    for brand in object_file.keys():
        brandslist[c + 1] = brand
        print str(c + 1) + ". " + brand + "\n"
        c += 1

    print "Enter your choice"
    ch = int(raw_input())
    selectedBrand = brandslist[ch]
    print selectedBrand
    c = 0
    for prods in range(len(object_file[selectedBrand])):
        for prod in object_file[selectedBrand][prods].keys():
            prodslist[c + 1] = object_file[selectedBrand][prods][prod]
            print str(c + 1) + ". " + prod + "\n"
        c += 1

    print "Enter your choice"
    ch = int(raw_input())
    print "1.Summary using Text Rank"
    print "2.Summary using TF-IDF"
    print "Enter your choice"
    choice = int(raw_input())

    summary = ""

    if choice == 1:
        print "Do you want to enable debugging (Y/N)?"
        ch_debug = raw_input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            rankedText = TextRank.summaryGen(prodslist[ch],
                                             domain,
                                             debugging=True)
        else:
            rankedText = TextRank.summaryGen(prodslist[ch], domain)

        sleep(3)

    elif choice == 2:
        print "Do you want to enable debugging (Y/N)?"
        ch_debug = raw_input().lower()
        print "Do you want to enter the token size (Y/N)?"
        ch_token = raw_input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            if ch_token == "y" or ch_token == "yes":
                print "Enter token size"
                token = int(raw_input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     gram=token,
                                                     debug=True)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     debug=True)
        else:
            if ch_token == "y" or ch_token == "yes":
                print "Enter token size"
                token = int(raw_input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     gram=token)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain)
    else:
        print "Wrong choice"
        return "Wrong choice"

    keys = keywords.extract_keywords(domain, prodslist[ch])
    rankedSummary = ""
    for i in range(len(rankedText)):
        rankedSummary += rankedText[i]
    stopwords = load_stop_words("../stoplist.txt")
    tokenizer = RegexpTokenizer(r"[\w']+", flags=re.UNICODE)
    tokens = tokenizer.tokenize(rankedSummary)
    tokens = [token for token in tokens if token.lower() not in stopwords]
    precision = float(len(set(tokens).intersection(set(keys)))) / float(
        len(tokens))
    recall = float(len(set(tokens).intersection(set(keys)))) / float(len(keys))
    fmeasure = 2 * (precision * recall) / (precision + recall)
    print "\n\n"
    print "Precision =", precision
    print "Recall =", recall
    print "F-Measure =", fmeasure
Example No. 9
def main(dom_choice, domain_list):
    if (dom_choice > len(domain_list)):
        print("Wrong choice")
        return "Wrong choice"
    domain = domain_list[dom_choice - 1]
    object_file = load(BRANDS_PARSED_PATH + '/' + domain.lower() + ".npz",
                       allow_pickle=True)
    object_file = object_file['arr_0'].tolist()
    c = 0
    brandslist = {}
    prodslist = {}
    for brand in object_file.keys():
        brandslist[c + 1] = brand
        print(str(c + 1) + ". " + brand + "\n")
        c += 1

    print("Enter your choice")
    ch = int(input())
    selectedBrand = brandslist[ch]
    print(selectedBrand)
    c = 0
    for prods in range(len(object_file[selectedBrand])):
        for prod in object_file[selectedBrand][prods].keys():
            prodslist[c + 1] = object_file[selectedBrand][prods][prod]
            print(str(c + 1) + ". " + prod + "\n")
        c += 1

    print("Enter your choice")
    ch = int(input())
    print("1.Summary using Text Rank")
    print("2.Summary using TF-IDF")
    print("Enter your choice")
    choice = int(input())

    summary = ""

    if choice == 1:
        print("Do you want to enable debugging (Y/N)?")
        ch_debug = input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            rankedText = TextRank.summaryGen(prodslist[ch],
                                             domain,
                                             debugging=True)
        else:
            rankedText = TextRank.summaryGen(prodslist[ch], domain)

        sleep(3)

    elif choice == 2:
        print("Do you want to enable debugging (Y/N)?")
        ch_debug = input().lower()
        print("Do you want to enter the token size (Y/N)?")
        ch_token = input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            if ch_token == "y" or ch_token == "yes":
                print("Enter token size")
                token = int(input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     gram=token,
                                                     debug=True)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     debug=True)
        else:
            if ch_token == "y" or ch_token == "yes":
                print("Enter token size")
                token = int(input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     gram=token)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain)
    else:
        print("Wrong choice")
        return "Wrong choice"

    keys = keywords.extract_keywords(domain, prodslist[ch])
    rankedSummary = ""
    for i in range(len(rankedText)):
        rankedSummary += rankedText[i]
    stopwords = load_stop_words("../stoplist.txt")
    tokenizer = RegexpTokenizer(r"[\w']+", flags=re.UNICODE)
    tokens = tokenizer.tokenize(rankedSummary)
    tokens = [token for token in tokens if token.lower() not in stopwords]
    precision = float(len(set(tokens).intersection(set(keys)))) / float(
        len(tokens))
    recall = float(len(set(tokens).intersection(set(keys)))) / float(len(keys))
    fmeasure = 2 * (precision * recall) / (precision + recall)
    print("\n\n")
    print("Precision =", precision)
    print("Recall =", recall)
    print("F-Measure =", fmeasure)
Example No. 10
def parse_custom(domain="cellphones"):
    global fakeness
    brands = load(BRANDS_PARSED_PATH + '/' + domain.lower() + '.npz',
                  allow_pickle=True)
    brands = brands['arr_0'].tolist()
    c = 0
    brandslist = {}
    prodslist = {}
    for brand in brands.keys():
        brandslist[c + 1] = brand
        print(str(c + 1) + ". " + brand + "\n")
        c += 1

    print("Enter your choice")
    ch = int(input())
    selectedBrand = brandslist[ch]
    c = 0
    for prods in range(len(brands[selectedBrand])):
        for prod in brands[selectedBrand][prods].keys():
            prodslist[c + 1] = brands[selectedBrand][prods][prod]
            print(str(c + 1) + ". " + prod + "\n")
        c += 1
    print("Enter your choice")
    ch = int(input())
    review = ""
    print("Enter your review")
    while True:
        try:
            line = input("")
        except EOFError:
            break
        review += line
    print("\nPlease wait a moment. Processing the result...\n")
    keywords_list = keywords.extract_keywords(domain, prodslist[ch])
    stopwords = nltk.corpus.stopwords.words()
    brandslist = []
    for i in brands.keys():
        brand_words = i.split()
        brandslist += brand_words
        for j in brands[i]:
            # dict views aren't indexable on Python 3; take the first key
            title_words = next(iter(j)).split()
            brandslist += title_words
    brandslist = [token for token in brandslist if token not in stopwords]
    brandslist = set(brandslist)
    text = nltk.word_tokenize(review)
    cnt = 0
    keyword_cnt = 0
    for i in text:
        if i in brandslist:
            cnt += 1
        if i in keywords_list:
            keyword_cnt += 1
    pos_tagged = nltk.pos_tag(text)
    analyze_text = TextBlob(review)
    counts = Counter(tag for word, tag in pos_tagged)
    # string.uppercase and len(filter(...)) are Python 2 only
    caps = sum(1 for x in review if x in string.ascii_uppercase)
    review_status = [0] * 7
    review_data = [0] * 7
    if len(review) != 0:
        c = Counter(c for c in review if c in ["?", "!"])
        review_data[0] = float(counts['PRP$']) / float(len(analyze_text.words))
        review_data[1] = analyze_text.subjectivity
        review_data[2] = float(caps) / len(review)
        review_data[3] = float(c["?"] + c["!"]) / len(review)
        review_data[4] = float(len(analyze_text.words)) / 1000
        review_data[5] = float(cnt) / float(len(analyze_text.words))
        review_data[6] = float(keyword_cnt) / float(len(analyze_text.words))
    if review_data[0] > 0:
        review_status[0] = 1
    if review_data[1] < 0.5:
        review_status[1] = 1
    if review_data[2] >= 0.5:
        review_status[2] = 1
    if review_data[3] >= 0.1:
        review_status[3] = 1
    if review_data[4] <= 0.135:
        review_status[4] = 1
    if review_data[5] >= 0.5 or review_data[5] <= 0.1:
        review_status[5] = 1
    if review_data[6] < 0.5:
        review_status[6] = 1
    detection_counter = collections.Counter(review_status)
    deceptive_level = (float(detection_counter[1]) / 7) * 100
    fakeness = deceptive_level
    return review_data
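A hypothetical driver for the function above; parse_custom prompts on stdin (end the review with EOF, e.g. Ctrl-D) and leaves the heuristic score in the module-level fakeness:

# Hypothetical driver; run from a terminal and answer the prompts.
if __name__ == "__main__":
    data = parse_custom("cellphones")
    print("Per-heuristic inputs:", data)
    print("Estimated fakeness: %.1f%%" % fakeness)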
		if ch_debug=="y" or ch_debug=="yes":
			if ch_token=="y" or ch_token=="yes":
				print "Enter token size"
				token=int(raw_input())
				rankedText=TFIDFSummary.summaryGen(prodslist[ch],domain,gram=token,debug=True)
			else:
				rankedText=TFIDFSummary.summaryGen(prodslist[ch],domain,debug=True)
		else:
			if ch_token=="y" or ch_token=="yes":
				print "Enter token size"
				token=int(raw_input())
				rankedText=TFIDFSummary.summaryGen(prodslist[ch],domain,gram=token)
			else:
				rankedText=TFIDFSummary.summaryGen(prodslist[ch],domain)

	keys=keywords.extract_keywords(domain,prodslist[ch])
	rankedSummary=""
	for i in range(len(rankedText)):
		rankedSummary+=rankedText[i]
	stopwords=load_stop_words("../stoplist.txt")
	tokenizer = RegexpTokenizer("[\w']+", flags=re.UNICODE)
	tokens = tokenizer.tokenize(rankedSummary)
	tokens = [token for token in tokens if token.lower() not in stopwords]
	precision = float(len(set(tokens).intersection(set(keys))))/float(len(tokens))
	recall = float(len(set(tokens).intersection(set(keys))))/float(len(keys))
	fmeasure = 2*(precision*recall)/(precision+recall)
	print "\n\n"
	print "Precision =",precision
	print "Recall =",recall
	print "F-Measure =",fmeasure