Example #1
from rake_nltk import Rake


def get_phrases(text=''):
    # Return at most the five highest-ranked key phrases in the text.
    rake = Rake()
    rake.extract_keywords_from_text(''.join(text))
    phrases = rake.get_ranked_phrases()
    return phrases[:5]


def phrase(ques):
    # Split every ranked phrase of the question into lowercase words.
    new_list = []
    r = Rake()
    r.extract_keywords_from_text(ques)
    for item in r.get_ranked_phrases():
        new_list.extend(item.lower().split())
    return new_list
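
A minimal usage sketch for the two helpers above; the sample sentence is invented and NLTK's stopword and punkt data are assumed to be downloaded already:

sample = "Keyword extraction with RAKE ranks candidate phrases by word degree and frequency."
print(get_phrases(sample))  # up to five top-ranked phrases
print(phrase(sample))       # flat list of lowercase keyword tokens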
Example #3
    qry = """
    select b.EXPERIENCE,b.QuestionTitle, a.Preprocessed_Responses TextForAnalysis from [CUS].[t_New_Preprocessed_Responses] a
    join [CUS].[t_SurveyQuestions_FY18H2GESS_Hierarchy] b
    on a.questionid=b.QuestionID
    where b.experience='Meeting' and a.Preprocessed_responses is not null
    """
    cur.execute(qry)
    rows = cur.fetchall()

    #cur.execute("TRUNCATE TABLE [CUS].[t1_ProcessingTextRank]")

    doc = ' '.join(row.TextForAnalysis for row in rows)
    #
    #    for row in rows:
    #        doc=' '.join(row.TextForAnalysis)
    r = Rake()
    r.extract_keywords_from_text(doc)
    keywords = r.get_ranked_phrases_with_scores()
    print(keywords)
    #
    ##    print(row.ID)
    #    keywords=' '.join(extract_key_phrases(row.TextForAnalysis))
    #    summary=extract_sentences(row.TextForAnalysis)
    #    cur.execute("""INSERT INTO [CUS].[t1_ProcessingTextRank] VALUES(?,?,?)""",(row.ID,keywords,summary))

    con.commit()
    con.close()

    print("Completed!")
if args["preprocess"] == "thresh":
	gray = cv2.threshold(gray, 0, 255,
		cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
 
# make a check to see if median blurring should be done to remove
# noise
elif args["preprocess"] == "blur":
	gray = cv2.medianBlur(gray, 3)
 
# write the grayscale image to disk as a temporary file so we can
# apply OCR to it
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray)
# load the image as a PIL/Pillow image, apply OCR, and then delete
# the temporary file
text = pytesseract.image_to_string(Image.open(filename)).encode('utf-8')
os.remove(filename)
print(text)

#get keywords
r = Rake()
r.extract_keywords_from_text(text.decode('utf-8'))
keywords = r.get_ranked_phrases_with_scores()
print("\n\nKEYWORDS:\n")
for word in keywords:
	print(word)
# show the output images
#cv2.imshow("Image", image)
#cv2.imshow("Output", gray)
#cv2.waitKey(0)
Example #5
    def get_meta_keywords(self):
        r = Rake()
        r.extract_keywords_from_text(self.body)

        return r.get_ranked_phrases()[:10]
Example #6
def keyword_processing(transcript) -> list:
    # Keep only phrases that are between two and four words long.
    r = Rake(min_length=2, max_length=4)
    r.extract_keywords_from_text(transcript)
    ranked_keywords = r.get_ranked_phrases()
    return ranked_keywords
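
A hedged usage sketch for keyword_processing; the transcript string below is invented and the NLTK data is assumed to be installed:

transcript = ("the quarterly earnings call covered supply chain delays, "
              "new product launches and long term growth targets")
print(keyword_processing(transcript))  # only 2- to 4-word phrases survive the length filter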
Example #7
import operator
from rake_nltk import Rake
from rake_nltk import Metric
import diffbot

with open('finalList.txt', 'r', encoding='utf-8') as f:
    words = f.read()

stop = words.split('\n')

r = Rake(ranking_metric=Metric.WORD_FREQUENCY)

url = 'https://www.cnn.com/2020/08/21/politics/peter-rafael-dzibinski-debbins-green-beret-russia/index.html'
urlNoNames = 'https://www.britannica.com/science/influenza'
json_result = diffbot.article(urlNoNames,
                              token='d656578220cbf622d16575aba331d47d')

words = (json_result['objects'][0]['text'])
r.extract_keywords_from_text(words)

result = r.get_ranked_phrases_with_scores()

print(result)
def keywords(x):
    r = Rake()
    r.extract_keywords_from_text(x)
    return ' '.join(list(r.get_word_degrees().keys()))
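
For comparison with the phrase-based helpers, a short hedged sketch of what this word-degree variant returns; the input string is illustrative:

sample_text = "graph based ranking assigns each content word a degree score"
# get_word_degrees() maps each content word to its degree; keywords() joins those words.
print(keywords(sample_text))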
Example #9
def extract_key_word_rank(text):
    r = Rake()
    r.extract_keywords_from_text(text)
    print(r.get_ranked_phrases())
    print("==============================")
    print(r.get_word_degrees())
mc_client = base.Client(('127.0.0.1', 11211), allow_unicode_keys=True)

with open('results.txt', 'a+') as result_file:
    for experience in experienceData:
        experienceText = experience[2]
        if type(experienceText) == float:
            continue
        r = Rake(stopwords=stop_words)

        # Strip Links
        experienceText = re.sub(r'^https?://.*[\r\n]*',
                                '',
                                experienceText,
                                flags=re.MULTILINE)

        r.extract_keywords_from_text(experienceText)
        phraselist = r.get_ranked_phrases()

        experienceduration = []
        setofskills = []
        setofphrases = []

        found_skills = get_skills()
        print(found_skills)
        for result in found_skills:
            result_file.write(f"{result}\n")
        result_file.flush()
        complete_found_skill_list.append(experienceText)

print("==================================================")
print(complete_found_skill_list)
def getMovies(title):
    df = pd.read_csv('Movies.csv')
    
    #taking title, genre,plot and actors for recommendation process
    df = df[['Title','Genre','Director','Actors','Plot']]

    # cleaning the 3 columns and bringing it to shape
    df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])
    df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))
    df['Director'] = df['Director'].map(lambda x: x.split(' '))

    # merging first and last name to avoid duplicates
    for index, row in df.iterrows():
        row['Actors'] = [x.lower().replace(' ','') for x in row['Actors']]
        row['Director'] = ''.join(row['Director']).lower()

    df['Key_words'] = ""

    for index, row in df.iterrows():
        # extracting all unique words from every row and adding to key_words column
        plot = row['Plot']
        r = Rake()
        r.extract_keywords_from_text(plot)
        keywordScores = r.get_word_degrees()
        row['Key_words'] = list(keywordScores.keys())

    df.drop(columns = ['Plot'], inplace = True)

    df.set_index('Title', inplace = True)
    df['bag_of_words'] = ''
    columns = df.columns

    for index, row in df.iterrows():
        #creating a BOG model with actor, director, title, plot
        words = ''
        for col in columns:
            if col != 'Director':
                words = words + ' '.join(row[col]) + ' '
            else:
                words = words + row[col] + ' '
        row['bag_of_words'] = words
        
    df.drop(columns = [col for col in df.columns if col != 'bag_of_words'], inplace = True)

    # instantiating and generating the count matrix
    count = CountVectorizer()
    count_matrix = count.fit_transform(df['bag_of_words'])
    titleIndex = pd.Series(df.index)
    titleIndex[:5]

    #generating the cosine similarity matrix
    cosineSimilarityMatrix = cosine_similarity(count_matrix, count_matrix)
    finalSelections = []

    # finding the row where the desired movie is located and taking the highest values excluding the unit value
    idx = titleIndex[titleIndex == title].index[0]
    scoreSeries = pd.Series(cosineSimilarityMatrix[idx]).sort_values(ascending = False)
    topMovies = list(scoreSeries.iloc[1:11].index)
        
    #appending the recommendations in a list
    for i in topMovies:
        finalSelections.append(list(df.index)[i])    
            
    return finalSelections
def get_text_keyword(text):

    rake = Rake()
    rake.extract_keywords_from_text(text=text)

    return rake.get_ranked_phrases()
Example #13
def justDoIt():
    ###############################################
    #### Update or verify the following values. ###
    ###############################################

    # Replace the subscription_key string value with your valid subscription key.
    subscription_key = 'd101f6aafa5c44208ead247cfb3d8b32'

    # Replace or verify the region.
    #
    # You must use the same region in your REST API call as you used to obtain your subscription keys.
    # For example, if you obtained your subscription keys from the westus region, replace
    # "westcentralus" in the URI below with "westus".
    #
    # NOTE: Free trial subscription keys are generated in the westcentralus region, so if you are using
    # a free trial subscription key, you should not need to change this region.
    uri_base = 'eastus2.api.cognitive.microsoft.com'

    headers = {
        # Request headers.
        # Another valid content type is "application/octet-stream".
        'Content-Type': 'application/octet-stream',
        'Ocp-Apim-Subscription-Key': subscription_key,
    }
    
    yaga = os.listdir('/Applications/XAMPP/xamppfiles/htdocs/Uploads/')
    filename1 = '/Applications/XAMPP/xamppfiles/htdocs/Uploads/'
    yaga2 = len(yaga) - 1
    filename3 = filename1 + yaga[yaga2]
    print(filename3)
    #filename2 = '/Applications/XAMPP/xamppfiles/htdocs/Uploads/handnotes3.jpg'
    k = open(filename3,'rb')
    body = k.read()
    k.close()


    # The URL of a JPEG image containing handwritten text.
    #body = "{'url':'C:/xampp/htdocs/bigredhax2017/Uploads/handnotes.jpg'}"

    # For printed text, set "handwriting" to false.
    params = urllib.urlencode({'handwriting' : 'true'})

    try:
        # This operation requires two REST API calls. One to submit the image for processing,
        # the other to retrieve the text found in the image.
        #
        # This executes the first REST API call and gets the response.
        conn = httplib.HTTPSConnection(uri_base)
        conn.request("POST", "/vision/v1.0/RecognizeText?%s" % params, body, headers)
        response = conn.getresponse()

        # Success is indicated by a status of 202.
        if response.status != 202:
            # Display JSON data and exit if the first REST API call was not successful.
            parsed = json.loads(response.read())
            print ("Error:")
            print (json.dumps(parsed, sort_keys=True, indent=2))
            conn.close()
            exit()

        # The 'Operation-Location' in the response contains the URI to retrieve the recognized text.
        operationLocation = response.getheader('Operation-Location')
        parsedLocation = operationLocation.split(uri_base)
        answerURL = parsedLocation[1]

        # NOTE: The response may not be immediately available. Handwriting recognition is an
        # async operation that can take a variable amount of time depending on the length
        # of the text you want to recognize. You may need to wait or retry this GET operation.

        #print('\nHandwritten text submitted. Waiting 10 seconds to retrieve the recognized text.\n')
        time.sleep(10)

        # Execute the second REST API call and get the response.
        conn = httplib.HTTPSConnection(uri_base)
        conn.request("GET", answerURL, '', headers)
        response = conn.getresponse()
        data = response.read()

        # 'data' contains the JSON data. The following formats the JSON data for display.
        parsed = json.loads(data)
        print ("Response:")
        jsonInput = json.dumps(parsed, sort_keys=True, indent=2)
        conn.close()

    except Exception as e:
        print('Error:')
        print(e)


    ####################################
    # This is something which converts a given JSON-string to a better
    # string for keyword analysis

    def jsonToTxt(jsonString):
        substringListOne=jsonString.split('"text": ')
        substringListTwo=[]
        stronk=""
        for sub in substringListOne:
            sub=sub[1:]
            i=0
            for s in sub:
                if (s=='\"'):
                    break
                i+=1
            sub=sub[:i]
            if len(sub)==0:
                stronk+=sub
            else:
                stronk+=sub+" "
        return stronk

    stank = jsonToTxt(jsonInput)

    ###########
    # This will find the keywords in stank
    from rake_nltk import Rake
    r = Rake()
    a = r.extract_keywords_from_text(stank)
    b = r.get_ranked_phrases()
    print(b[0])
    return b[0]
Example #14
os.remove(filename)

# print results---------------------------------------------

print(fixed_question)
print()
print(ocr_a1)
print(ocr_a2)
print(ocr_a3)
print()

# Clean up OCR'd question to just keywords

r = Rake()  # Uses stopwords for English from NLTK, and all punctuation characters.
r.extract_keywords_from_text(fixed_question)
phrases = r.get_ranked_phrases()  # Keyword phrases ranked highest to lowest.
phrases_clean = (' '.join('"{0}"'.format(w) for w in phrases))

print("Original Question: ", fixed_question)
print("Extracted Phrases: ", phrases_clean)
print()

# google for result count------------------------------------

search = phrases_clean, " +", "\"", ocr_a1, "\"", " -", "\"", ocr_a2, "\"", " -", "\"", ocr_a3, "\""
searchclean1 = ''.join(search)

r1 = requests.get("https://www.google.com/search", params={'q': searchclean1})
print(sentence)
sentence.draw()

from polyglot.text import Text
text = Text(text)
for sent in text.sentences:
    print(sent, "\n")
    for entity in sent.entities:
        print(entity.tag, entity)

with open(r'text file', encoding='utf-8') as file:
    novels = file.read()
print(novels[:106])

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

# Get the Arabic stopwords and print some of them
arb_stopwords = stopwords.words("arabic")
print(arb_stopwords[:414])

rake = Rake(stopwords=stopwords.words('arabic'),
            punctuations=',./»:،؛":.,’\'',  # punctuation characters to ignore
            language='arabic',
            max_length=3)
rake.extract_keywords_from_text(novels)
for phrase in rake.get_ranked_phrases()[:24]:
    print(phrase)
Example #16
def extract_keywords(text):
    r = Rake()
    r.extract_keywords_from_text(text)
    keywords = r.get_ranked_phrases()
    return keywords[0:20]
Example #17
#!/usr/bin/python3
# coding: utf-8
# pip install rake-nltk
from rake_nltk import Rake
from nltk import tokenize
r = Rake()  # Uses stopwords for English from NLTK, and all punctuation characters by default
##################################################################
## Extraction given the text.
mytext = '''Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered.
            Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given.
            These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.'''
r.extract_keywords_from_text(mytext)
print(r.get_ranked_phrases())  # To get keyword phrases ranked highest to lowest.
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())  # To get keyword phrases ranked highest to lowest with scores.
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]
##################################################################
## Extraction given the list of strings where each string is a sentence.
r.extract_keywords_from_sentences(tokenize.sent_tokenize(mytext))
print(r.get_ranked_phrases())
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]
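
Building on the default usage above, a short sketch of the constructor options that appear in other examples on this page (ranking metric, phrase-length limits, explicit stopword and punctuation lists); the parameter values are only illustrative:

import string
from nltk.corpus import stopwords
from rake_nltk import Rake, Metric

# Rank by plain word frequency instead of the default degree-to-frequency ratio,
# and keep only phrases of two to four words.
r_custom = Rake(ranking_metric=Metric.WORD_FREQUENCY,
                min_length=2,
                max_length=4,
                stopwords=stopwords.words('english'),
                punctuations=string.punctuation)
r_custom.extract_keywords_from_text(mytext)
print(r_custom.get_ranked_phrases_with_scores())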
Example #18
    if column == 'main_speaker':
        final_ted[column] = final_ted[column].apply(combine_string)
    if column in ['title', 'url', 'tags']:
        continue
    final_ted[column] = final_ted[column].apply(remove_punctuation)

# Distilling TED Talk description down to key words for each talk
final_ted['key_words'] = ""
for index, row in final_ted.iterrows():
    desc = row['description']

    # Uses a NLTK Rake object. English stopwords and punctuation removed.
    rake = Rake()

    # Extracting key words from TED Talk description
    rake.extract_keywords_from_text(desc)

    # Key words and scores for key words
    dict_keys_scores = rake.get_word_degrees()

    # assigning the key words to the new column for the corresponding movie
    row['key_words'] = [row['main_speaker']] + list(dict_keys_scores.keys())

# Removing description column
final_ted.drop(columns=['description'], inplace=True)

# New data frame with keywords, indexed by title. Converts key_words lists to comma-delimited string
keyword_df = final_ted.filter(['key_words'])
keyword_df = keyword_df.set_index(final_ted['title'])

for i in range(len(keyword_df['key_words'])):
Example #19
    for ss in s1:
        if ss not in stop_words:
            c = c + 1

    sentence1 = sentence1.lower().split()

    s2 = word_tokenize(sentence2.lower())
    for ss in s2:
        if ss not in stop_words:
            c = c + 1

    sentence2 = sentence2.lower().split()
    return model.wmdistance(sentence1, sentence2) / (c * 1.0)


def rogue2_bleu(gt, pred):
    tokens = nltk.word_tokenize(gt)
    bigramgt = set(nltk.bigrams(tokens))
    tokens = nltk.word_tokenize(pred)
    bigrampred = set(nltk.bigrams(tokens))

    return (len(bigramgt.intersection(bigrampred))) / (len(bigramgt) * 1.0), (
        len(bigramgt.intersection(bigrampred))) / (len(bigrampred) * 1.0)


query = input("Enter query\n")
r.extract_keywords_from_text(query)
ranked_tags = r.get_ranked_phrases()
print('Quote=' + str(get_quote(query, ranked_tags)))
print('QNA=' + str(get_qna(query, ranked_tags)))
Example #20
from nltk.stem.porter import PorterStemmer
from nltk.probability import FreqDist
import operator

import docx
import pandas as pd
from rake_nltk import Rake


xcel = pd.read_excel('C:\\Users\\kartikaya\\Python_Data\\Article_Search_MindTrust\\FUNDS\\Funds-sectors.xlsx')
funds = xcel.iloc[:, 0]
tags = xcel.iloc[:, 1]
isin = xcel.iloc[:, 2]
all_files = os.listdir("C:\\Users\\kartikaya\\Python_Data\\Article_Search_MindTrust\\Articles")

file = 'C:\\Users\\kartikaya\\Python_Data\\Article_Search_MindTrust\\Articles\\' + all_files[0]

doc = docx.Document(file)

full_text = []

for para in doc.paragraphs:
    full_text.append(para.text)

# Join the paragraphs into a single document string.
full_text = '\n'.join(full_text)

r = Rake()
r.extract_keywords_from_text(full_text)
print(r.get_ranked_phrases_with_scores())

Example #21
def load_events_for_date(request, date):
    # get the json with events for a specific date
    bundle_type = 'medium'
    limit = 9999
    offset = 0
    status = 'ongoing'
    event_list_json = requests.get(
        f"https://www.blogto.com/api/v2/events/?bundle_type={bundle_type}&date={date}&limit={limit}&offset={offset}&status={status}"
    )

    # convert json received into a python dictionary
    event_list = json.loads(event_list_json.content)
    print(event_list)

    r = Rake(min_length=1, max_length=1)

    for event_summary in event_list["results"]:
        event_full_json = requests.get(
            f"https://www.blogto.com/api/v2/events/{event_summary['id']}")
        event_full = json.loads(event_full_json.content)
        print(">>> Event Title:", event_full['title'], "<<<")

        r.extract_keywords_from_text(event_full["title"])
        word_list = r.get_ranked_phrases()

        try:
            if event_full["location"]:
                location, location_created = Location.objects.get_or_create(
                    latitude=event_full["location"]["latitude"],
                    longitude=event_full["location"]["longitude"],
                    defaults={
                        'address': event_full['address'],
                        'city': event_full['city'],
                        'province': event_full['province']
                    })
            else:
                location, location_created = Location.objects.get_or_create(
                    latitude=None,
                    longitude=None,
                    city=event_full['city'],
                    defaults={
                        'address': event_full['address'],
                        'province': event_full['province']
                    })
        except Location.MultipleObjectsReturned:
            print("Duplicate location: " + str(event_full["location"]))
        try:
            event_object, event_created = Event.objects.get_or_create(
                blogto_id=event_full["id"],
                date=date,
                defaults={
                    'title': event_full["title"],
                    'description': event_full["description_stripped"],
                    'image_url':
                    event_full["image_url"] + "?width=1920&height=1080",
                    'start_time': event_summary["start_time"],
                    'end_time': event_summary["end_time"],
                    'venue_name': event_full["venue_name"],
                    'location': location
                })
            # looping through events, creating a list of keywords, looping through keywords to create keyword object for each
            # only if word has not been previously created
            print("List of keywords:")
            for word in word_list:
                try:
                    kword, kword_created = Keyword.objects.get_or_create(
                        word=word)
                    event_object.keywords.add(kword)
                    print(word)
                except Keyword.MultipleObjectsReturned:
                    print("Duplicate keyword")
        except Event.MultipleObjectsReturned:
            print("Duplicate event Id: " + str(event_full["id"]))

    return HttpResponse(f"Loaded events into db for {date}.")
Example #22
def senti(noOfSearchTerms, topicname):

    consumer_key = 'g6hWMVoCGEWaYDWg3Km3YaehA'
    consumer_secret = 'KqMmBhdAsSRcBTO7w18hzBYm4G4BgbfWYHc7lfSmPDUvbCBh4U'

    access_token_key = '582955639-7fbAiHMKNz4Mizm26Jbcp0yX9mzMa9GhEyYhoXb3'
    access_token_secret = 'QoQT7WIZVuQ3HD7ipSuA7MxvOxcHA94suGiOeFvVBXE5x'

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token_key, access_token_secret)

    api = tweepy.API(auth)
    noOfSearchTerms = noOfSearchTerms
    topicname = topicname
    h1 = noOfSearchTerms
    noOfSearchTerms = noOfSearchTerms + 1000  # extra tweets so the data can be processed

    tweets = tweepy.Cursor(api.search, q=topicname,
                           lang="en").items(noOfSearchTerms)

    unwanted_words = ['@', 'RT', ':', 'https', 'http']
    symbols = ['@', '#']
    data = []
    url = {}
    n1 = h1
    pp1 = 0
    pp2 = 0
    pp3 = 0
    s1 = 0
    f = 0
    mmm = 0
    n = []
    pos = []
    neg = []
    neu = []
    posneg = []
    r = 0
    total = []
    ttest = []
    urls = []
    times = []
    for tweet in tweets:
        z1 = 0
        if (s1 == h1 + 1):  # stop once the number of tweets the user entered has been collected
            break
        time = tweet.created_at
        url = 'https://twitter.com/statuses/' + tweet.id_str
        text = tweet.text
        textWords = text.split()
        u = 0
        cleanedTweet = ' '.join(
            re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT)", " ",
                   text).split())
        if (f == 1):
            while (z1 < s1):  ##   for the retweets
                if (n[z1] == cleanedTweet):
                    z1 = z1 + 1
                    u = 1
                else:
                    z1 = z1 + 1
        r = 0
        if (len(cleanedTweet.split()) > 5 and u == 0):
            analysis = TextBlob(cleanedTweet)
            polarity = 'Positive'
            total.append(cleanedTweet)
            if (analysis.sentiment.polarity < 0):
                polarity = 'Negative'
                n1 = n1 - 1
                pp2 = pp2 + 1
                neg.append(cleanedTweet)
                posneg.append(cleanedTweet)
                ttest.append(3)
                mmm = mmm + 1
            elif (0 <= analysis.sentiment.polarity <= 0.2):
                polarity = 'Neutral'
                n1 = n1 - 1
                pp1 = pp1 + 1
                neu.append(cleanedTweet)
                posneg.append(cleanedTweet)
                ttest.append(4)
                mmm = mmm + 1
            else:
                pos.append(cleanedTweet)
                posneg.append(cleanedTweet)
                ttest.append(2)
                pp3 = pp3 + 1
                mmm = mmm + 1
            s2 = cleanedTweet
            s1 = s1 + 1
            dic = {}
            dic['Sentiment'] = polarity
            dic['Tweet'] = cleanedTweet
            dic['URL'] = url
            dic['Time'] = time
            dic['Sentiment Scores'] = analysis.sentiment
            data.append(dic)
            urls.append(url)
            times.append(time)

            df = pd.DataFrame(data)
            df.to_csv('analysis.csv')
            f = 1
            n.append(cleanedTweet)
        else:
            salu = 500
    print("number of positive tweets are ", pp3)
    print("number of neutral tweets are ", pp1)
    print("number of negative tweets are ", pp2)

    ###bag of words

    vectorizer = CountVectorizer()
    salman = vectorizer.fit_transform(posneg)
    zzz = salman.toarray()

    ###classfiers for making xtrain ytrain xtest

    count = 0
    xtest = []
    xneu = []
    xtrain = []
    ytrain = []
    ytest = []
    ytest1 = []
    ytest2 = []
    while (count < len(ttest)):
        if (ttest[count] == 2):
            xtrain.append(zzz[count])
            ytrain.append(2)
            count = count + 1
        elif (ttest[count] == 3):
            xtrain.append(zzz[count])
            ytrain.append(3)
            count = count + 1
        elif (ttest[count] == 4):
            xneu.append(zzz[count])
            count = count + 1

    ###logrog

    xtrain1 = np.array(xtrain)
    clf = linear_model.SGDClassifier(max_iter=1000, shuffle=False, loss='log')
    clf.fit(xtrain1, ytrain)
    value = 0
    while (value < len(xneu)):
        um = xneu[value]
        a = clf.predict([um])
        aa = clf.predict_log_proba([um])
        hh = aa[0]
        hhh = hh[1]
        hhhh = hh[0]
        q11 = math.exp(hhh)
        q12 = math.exp(hhhh)
        if (q11 < q12):
            mm = q12 - q11
        else:
            mm = q11 - q12
        if (mm < 0.1):
            ytest.append([4])
        else:
            ytest.append(a)
        value = value + 1

    #print("log")

    #print(len(ytest))
    #print(ytest)

    ### svm

    xtrain2 = np.array(xtrain)
    clf1 = SVC(kernel='linear', probability=True)
    clf1.fit(xtrain2, ytrain)
    value = 0
    while (value < len(xneu)):
        um = xneu[value]
        a = clf1.predict([um])
        aa = clf.predict_log_proba([um])
        hh = aa[0]
        hhh = hh[1]
        hhhh = hh[0]
        q11 = math.exp(hhh)
        q12 = math.exp(hhhh)
        if (q11 < q12):
            mm = q12 - q11
        else:
            mm = q11 - q12
        if (mm < 0.1):
            ytest1.append([4])
        else:
            ytest1.append(a)
        value = value + 1

    #print("svm")

    #print(len(ytest1))
    #print(ytest1)

    ###naive bayes

    xtrain3 = np.array(xtrain)
    clf2 = GaussianNB()
    clf2.fit(xtrain3, ytrain)
    value = 0
    while (value < len(xneu)):
        um = xneu[value]
        a = clf2.predict([um])
        ytest2.append(a)
        value = value + 1

    #print(" nb values")
    #print(ytest2)
    #print(len(ytest2))

    ##for all classifier if two values are same select those classifier

    finaltest = []
    i = 0
    length = len(ytest)
    while (i < length):
        if (ytest[i] == ytest1[i] and ytest[i] == ytest2[i]
                and ytest1[i] == ytest2[i]):
            finaltest.append(ytest[i])
        elif (ytest[i] == ytest1[i] and ytest[i] != ytest2[i]
              and ytest1[i] != ytest2[i]):
            finaltest.append(ytest[i])
        elif (ytest[i] != ytest1[i] and ytest[i] == ytest2[i]
              and ytest1[i] != ytest2[i]):
            finaltest.append(ytest[i])
        elif (ytest[i] != ytest1[i] and ytest[i] != ytest2[i]
              and ytest1[i] == ytest2[i]):
            finaltest.append(ytest1[i])
        else:
            yyyyy = 787878
        i = i + 1

# print("finaltest")
# print(finaltest)
# print(len(finaltest))

### after classifier the results

    ff1 = 0
    ff2 = 0
    ff3 = 0
    qq = 0
    numb = 0
    dic = {}
    data = []
    posafter = []
    negafter = []
    while (numb < len(n)):
        cleanedTweet = n[numb]
        url = urls[numb]
        time = times[numb]
        analysis = TextBlob(cleanedTweet)
        polarity = 'Positive'
        if (analysis.sentiment.polarity < 0):
            polarity = 'Negative'
            ff1 = ff1 + 1
            negafter.append(cleanedTweet)
        elif (0 <= analysis.sentiment.polarity <= 0.2):
            if (finaltest[qq] == [4]):
                polarity = 'Neutral'
                ff3 = ff3 + 1
            elif (finaltest[qq] == 3):
                polarity = 'Negative'
                ff1 = ff1 + 1
                negafter.append(cleanedTweet)
            elif (finaltest[qq] == 2):
                polarity = 'Positive'
                ff2 = ff2 + 1
                posafter.append(cleanedTweet)

            qq = qq + 1

        else:
            ff2 = ff2 + 1
            posafter.append(cleanedTweet)
        numb = numb + 1
        dic = {}
        dic['Sentiment'] = polarity
        dic['Tweet'] = cleanedTweet
        dic['URL'] = url
        dic['Time'] = time
        dic['Sentiment Scores'] = analysis.sentiment
        data.append(dic)
        df = pd.DataFrame(data)
        df.to_csv('analysis2.csv')
    print("after classification")
    print("positive tweets are", ff2)
    print("negative tweets are", ff1)
    print("neutral tweets are", ff3)
    i = 0
    text1 = ''

    while (i < len(posafter)):
        text1 = text1 + posafter[i]
        text1 = text1 + ". "
        i = i + 1

    i = 0
    text = ''
    while (i < len(negafter)):
        text = text + negafter[i]
        text = text + ". "
        i = i + 1

    #print("positive tweets text")
    #print(text1)
    #print("negtive tweets text")
    #print(text)

    # Used when tokenizing words
    sentence_re = r'''(?x)          # set flag to allow verbose regexps
            (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
          | \w+(?:-\w+)*        # words with optional internal hyphens
          | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
          | \.\.\.              # ellipsis
          | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
        '''

    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    #Taken from Su Nam Kim Paper...
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
            
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """

    # negative tweets grammar
    chunker = nltk.RegexpParser(grammar)

    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)

    #   print(postoks)

    tree = chunker.parse(postoks)

    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')

    def leaves(tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(
                filter=lambda t: t.label() == 'NP'
        ):  #for subtree in tree.subtrees(filter = lambda t: t.node=='NP'):
            yield subtree.leaves()

    def normalise(word):
        """Normalises words to lowercase and stems and lemmatizes it."""
        word = word.lower()
        word = stemmer.stem(word)  #_word(word)
        word = lemmatizer.lemmatize(word)
        return word

    def acceptable_word(word):
        """Checks conditions for acceptable word: length, stopword."""
        accepted = bool(2 <= len(word) <= 40 and word.lower() not in stopwords)
        return accepted

    def get_terms(tree):
        for leaf in leaves(tree):
            term = [normalise(w) for w, t in leaf if acceptable_word(w)]
            yield term

    terms = get_terms(tree)
    '''
    for term in terms:
        for word in term:
            print(word+' ')
        print('')
    '''
    # positive tweets grammar
    chunker1 = nltk.RegexpParser(grammar)

    toks1 = nltk.regexp_tokenize(text1, sentence_re)
    postoks1 = nltk.tag.pos_tag(toks1)
    #print("positive grammer")
    #print(postoks1)

    tree1 = chunker.parse(postoks1)

    terms1 = get_terms(tree1)
    '''
    for term in terms1:
        for word in term:
            print(word+' ')
        print('')
    '''

    #rake for keywords for negative tweets

    r = Rake()

    #   print("cons scores")
    r.extract_keywords_from_text(text)
    consscores = r.get_ranked_phrases_with_scores()

    #   print(consscores)

    p1 = len(consscores)
    p2 = len(consscores[0])
    freshcons = []
    i = 0
    while (i < p1):
        p3 = consscores[i][1]
        p4 = consscores[i][0]
        h = p3.split()

        if (len(h) <= 3 and p4 != 1.0):
            freshcons.append(p3)
        i = i + 1


#    print(freshcons)

#rake for keywords for positive tweets

#    print("pros scores")
    r.extract_keywords_from_text(text1)
    proscores = r.get_ranked_phrases_with_scores()

    #print(proscores)

    p1 = len(proscores)
    p2 = len(proscores[0])
    freshpros = []
    i = 0
    while (i < p1):
        p3 = proscores[i][1]
        p4 = proscores[i][0]
        h = p3.split()
        if (len(h) <= 3 and p4 != 1.0):
            freshpros.append(p3)
        i = i + 1

    #print(freshpros)

    ## convert both tagged-token lists to lower case

    yyy = [[x.lower() for x in line] for line in postoks]
    yyy1 = [[x.lower() for x in line] for line in postoks1]

    # rules: keep negative phrases that contain the topic keyword or whose words are tagged nnp

    i = 0
    y = 0
    conlist = []
    p = 0
    while (i < len(freshcons)):
        h = freshcons[i].split()
        y = 0
        while (y < len(h)):
            if (h[y] == topicname):
                conlist.append(freshcons[i])
                y = y + 1
                break
            else:
                y = y + 1
        i = i + 1
    k = 0
    while (k < len(freshcons)):
        h = freshcons[k].split()
        p = 0
        while (p < len(yyy)):
            if h[0] in yyy[p]:
                if (len(h) == 3):
                    if (yyy[p][1] == 'nnp' or yyy[p + 1][1] == 'nnp'
                            or yyy[p + 2][1] == 'nnp'):
                        conlist.append(freshcons[k])
                        break
                elif (len(h) == 2):
                    if (yyy[p][1] == 'nnp' or yyy[p + 1][1] == 'nnp'):
                        conlist.append(freshcons[k])
                        break
                elif (len(h) == 1):
                    if (yyy[p][1] == 'nnp'):
                        conlist.append(freshcons[k])
                        break
                else:
                    pppp = 555555
                p = p + 1
            else:
                p = p + 1
        k = k + 1

    #print("negative tweets key words")
    #print(freshcons)
    print("conlist")
    print(conlist)

    # rules: keep positive phrases that contain the topic keyword or whose words are tagged nnp

    i = 0
    y = 0
    prolist = []
    p = 0
    while (i < len(freshpros)):
        h = freshpros[i].split()
        y = 0
        while (y < len(h)):
            if (h[y] == topicname):
                prolist.append(freshpros[i])
                y = y + 1
                break
            else:
                y = y + 1
        i = i + 1
    k = 0
    while (k < len(freshpros)):
        h = freshpros[k].split()
        p = 0
        while (p < len(yyy1)):
            if h[0] in yyy1[p]:
                if (len(h) == 3):
                    if (yyy1[p][1] == 'nnp' or yyy1[p + 1][1] == 'nnp'
                            or yyy1[p + 2][1] == 'nnp'):
                        prolist.append(freshpros[k])
                        break
                elif (len(h) == 2):
                    if (yyy1[p][1] == 'nnp' or yyy1[p + 1][1] == 'nnp'):
                        prolist.append(freshpros[k])
                        break
                elif (len(h) == 1):
                    if (yyy1[p][1] == 'nnp'):
                        prolist.append(freshpros[k])
                        break
                else:
                    pppp = 555555
                p = p + 1
            else:
                p = p + 1
        k = k + 1

    #print("positive tweets key words")
    #print(freshpros)

    print("prolist")
    print(prolist)

    fh = open("output.txt", "w+")
    pros = open("proslist.txt", "w+")
    cons = open("conslist.txt", "w+")
    fh.write(str(pp3))
    fh.write("\n" + str(pp2))
    fh.write("\n" + str(pp1))
    fh.write("\n" + str(ff2))
    fh.write("\n" + str(ff1))
    fh.write("\n" + str(ff3))
    cons.write("\n" + str(conlist))
    pros.write("\n" + str(prolist))
    fh.close()
    pros.close()
    cons.close()

    # myAPI = "http://localhost/Twitter/set_return.php?PP3="+str(pp3)+"&PP2="+str(pp2)+"&PP1="+str(pp1)+"&ff2="+str(ff2)+"&ff1="+str(ff1)+"&ff3="+str(ff3)+"&conlist="+str(conlist)+"&prolist="+str(prolist)

    # print(myAPI)

    return pp3, pp2, pp1, ff2, ff1, ff3, conlist, prolist
def summarize_doc(content, length):
    # "Summarize" by joining the ranked key phrases and keeping the first `length` words.
    r = Rake()
    r.extract_keywords_from_text(content)
    # summarized = r.get_ranked_phrases_with_scores()
    summarized = ' '.join(r.get_ranked_phrases()).split(' ')[:length]
    return summarized
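
A small usage sketch for summarize_doc with an invented input string and length, assuming the NLTK stopword and punkt data are available:

doc = ("RAKE extracts candidate phrases by splitting text on stopwords and "
       "punctuation, then scores them by word degree and frequency.")
print(summarize_doc(doc, 10))  # first ten words drawn from the ranked phrases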
    def this_filter(self, text, top=100):
        r = Rake()
        r.extract_keywords_from_text(text)
        phrases = r.get_ranked_phrases()
        most_common = phrases[0:top]
        return most_common
Example #25
def extract_summary_and_keywords_from_pdf(articles_dict):
    for article_key, article_value in articles_dict.items():
        link = article_value
        with open(link, "rb") as f:
            pdf = pdftotext.PDF(f)
        article_text = ""

        for p in pdf:
            article_text += p

        raw_data = article_text
        # Removing Square Brackets and Extra Spaces
        article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
        article_text = re.sub(r'\s+', ' ', article_text)

        # Removing special characters and digits
        formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text)
        formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

        sentence_list = nltk.sent_tokenize(article_text)

        stopwords = nltk.corpus.stopwords.words('english')

        word_frequencies = {}
        for word in nltk.word_tokenize(formatted_article_text):
            if word not in stopwords:
                if word not in word_frequencies.keys():
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1

        try:
            maximum_frequency = max(word_frequencies.values())
        except ValueError:
            # word_frequencies is empty (no extractable text); skip this article
            continue

        for word in word_frequencies.keys():
            word_frequencies[word] = (word_frequencies[word] /
                                      maximum_frequency)

        sentence_scores = {}
        for sent in sentence_list:
            for word in nltk.word_tokenize(sent.lower()):
                if word in word_frequencies.keys():
                    if len(sent.split(' ')) < 30:
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = word_frequencies[word]
                        else:
                            sentence_scores[sent] += word_frequencies[word]

        import heapq
        summary_sentences = heapq.nlargest(7,
                                           sentence_scores,
                                           key=sentence_scores.get)

        summary = ' '.join(summary_sentences)

        # Removing Square Brackets and Extra Spaces
        summary = re.sub(r'\[[0-9]*\]', ' ', summary)
        summary = re.sub(r'\s+', ' ', summary)

        # Removing special characters and digits
        summary = re.sub('[^a-zA-Z]', ' ', summary)
        summary = re.sub(r'\s+', ' ', summary)

        print(summary)
        print("----------------------------------------")
        print("----------------------------------------")
        print("----------------------------------------")
        print("----------------------------------------")

        from rake_nltk import Rake
        r = Rake()
        r.extract_keywords_from_text(raw_data)
        keywords_yay = r.get_ranked_phrases()
        print(keywords_yay)

        print("----------------------------------------")
        update_database_from_pdf(article_key, link, keywords_yay, summary)
inputfolderpath2 = "hdfs://richmond:53001/SampleInputs/keyword_input.csv"

schema2 = StructType([ \
    StructField("Keyword", StringType(), True), \
    StructField("RowId & Score", StringType(), True)])
inputfileRDD = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', inferschema='true', sep=",", multiLine = True, quote='"', escape='"') \
    .load(inputfolderpath2, schema = schema2).rdd.repartition(30)

textinputfile="/s/chopin/k/grad/deotales/Source-Recommendation-System/ExampleRun/diff_input.txt"
file1 = open(textinputfile,"r")
text = file1.read()
# text = str(text.encode('ascii', "ignore"))
file1.close() 
rake = Rake()
rake.extract_keywords_from_text(text)
keyphrases_w_scores = rake.get_ranked_phrases_with_scores()
keyphrases_w_scores = keyphrases_w_scores[0:len(keyphrases_w_scores) // 2]
keyphrases = rake.get_ranked_phrases()

inputfileRDD = inputfileRDD\
    .flatMap(lambda row: match_phrases(row[0], row[1]))\
    .flatMap(lambda row: map_scored_ids(row[0], row[1]))\
    .reduceByKey(lambda a, b: (float(a))+(float(b)))\
    .top(15, key=lambda x: x[1])

# print(inputfileRDD.count())
id_list_w_scores = inputfileRDD
print(id_list_w_scores)
id_list = [x[0] for x in id_list_w_scores]
print(id_list)
# Skip header row
next(rows)

for row in rows:
    # Extract value from spreadsheet and save to variable
    db_id = row[0].value
    rs_num = row[1].value
    description = row[2].value

    r = Rake(
        min_length=2, max_length=3
    )  # Uses stopwords for English from NLTK, and all punctuation characters.

    soup = BeautifulSoup(description, 'html.parser')

    #print(soup.get_text())

    r.extract_keywords_from_text(soup.get_text())

    keywords = r.get_ranked_phrases(
    )  # To get keyword phrases ranked highest to lowest.

    #print(r.get_ranked_phrases_with_scores())
    worksheet.write(row1, col, db_id)
    worksheet.write(row1, col + 1, rs_num)
    worksheet.write(row1, col + 2, str(keywords))
    row1 += 1

workbook.close()
print('Spreadsheet Generated')
Example #28
    director = director.replace(" ","")
    director = director.replace("-","")
    director = director.replace(".","")
    director = director.lower()
    director = director.split(',')
    attributes.extend(director)
    actor = row['Actors']
    actor = actor.replace(" ","")
    actor = actor.replace("-","")
    actor = actor.replace(".","")
    actor = actor.lower()
    actor = actor.split(',')
    attributes.extend(actor)
    plot = row['Description']
    r = Rake()
    r.extract_keywords_from_text(plot)
    key_words_dict_scores = r.get_word_degrees()
    attributes.extend(key_words_dict_scores.keys())

    finalatt = list(attributes)
    attr = ' '.join(finalatt)
    row['Attributes'] = attr
    attributes.clear()

#Vectorization
count = CountVectorizer()
count_matrix = count.fit_transform(df['Attributes'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)
indi = pd.Series(df.Title)
indices = indi
Example #29
# In[3]:

# create document matrix for manually classified data

classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
processed_documents = []

for _class in classes:
    query = "SELECT `title` FROM articles WHERE class = \"" + str(
        _class) + "\" AND sr_no < 901"
    cursor.execute(query)
    article_fetch = cursor.fetchall()
    processed_string = ""
    for article in article_fetch:
        r = Rake(language='english')
        r.extract_keywords_from_text(article[0])
        tags = r.get_ranked_phrases()
        for tag in tags:
            tokens = set(word_tokenize(tag))
            for token in tokens:
                curr_tag = stemmer.stem(token)
                if curr_tag not in processed_string:
                    processed_string += curr_tag + " "
    processed_documents.append(processed_string)

# In[3]:

processed_documents = []
processed_string = ""
deaths = "killed beaten death burned mortal"
r = Rake(language='english')
Example #30
    'director', 'country', 'date_added', 'release_year', 'duration',
    'Unnamed: 12', 'show_id', 'rating'
],
                  inplace=True,
                  axis=1)

#netflix_data.isna().sum()

netflix_data.dropna(inplace=True)

netflix_data['Key_words'] = ""
netflix_data['Type'] = ""
for index, row in netflix_data.iterrows():
    description = row['description']
    r = Rake()
    r.extract_keywords_from_text(description)
    key_words_dict_scores = r.get_word_degrees()
    row['Key_words'] = list(key_words_dict_scores.keys())

netflix_data.drop(columns=['description'], inplace=True)

netflix_data['listed_in'] = netflix_data['listed_in'].map(
    lambda x: x.lower().split(','))
netflix_data['cast'] = netflix_data['cast'].map(lambda x: x.split(',')[:3])
# netflix_data['director'] = netflix_data['director'].map(lambda x: x.split(','))
#netflix_data

netflix_data.set_index('title', inplace=True)
#netflix_data.head()

netflix_data['bag_of_words'] = ''
Example #31
def Main(request): 
    search_form = Search(request.POST)
    if search_form.is_valid(): 
        query= request.POST['search_form']
        try: 
            from googlesearch import search 
        except ImportError:  
            print("No module named 'google' found") 
            
        all_key_words = []
        results= []
        images= []
        request.session['search']=random.randint(0,100000) 
        for result in search(query, tld="COM", num=10, stop=10, pause=2): 
            # Ignore converting links from HTML
            #url=result
            print(result)
            results.append(result) 
            response = Request(result, headers={'User-Agent': 'Mozilla/5.0'})
            webContent = urlopen(response).read()
            h= html2text.HTML2Text() 
            h.ignore_links=  True 
            h.ignore_images= True 
            text= h.handle(unidecode(str(webContent,errors='ignore')))
            #print(re.sub('[*#@$-]','', unidecode(text)))  
            print('hello1')
        # nlp 
            text = re.sub("[^0-9a-zA-Z]+"," ",text)# removes non-alphanumeric characters
            r = Rake(max_length = 2,ranking_metric=Metric.WORD_DEGREE)
            r.extract_keywords_from_text(text)
            key_words = r.get_ranked_phrases()
            # print('key_words='+str(key_words))
            try: 
                all_key_words.append(key_words[0]) # num of keywords per query
            except: 
                pass
        print('hello2')
        print("all_kws", all_key_words)  # the most relevant phrase collected for each result
        #image search
        d = webdriver.Chrome(executable_path='/home/alisher/Desktop/Projects/IB/Drivers/chromedriver')
        def waits(time,xpath): 
            try:
                element = WebDriverWait(d, time).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                )
            except: 
                print("error occured")
                pass 
        for num,word in enumerate(all_key_words):
            d.get('https://duckduckgo.com/?q='+word+'&t=h_&iax=images&ia=images') 
            waits(3,'/html/body/div[2]/div[3]/div/div/div[2]/div/div[1]/div[1]/span/img')
            img= d.find_element_by_xpath('/html/body/div[2]/div[3]/div/div/div[2]/div/div[1]/div[1]/span/img')
            src= img.get_attribute('src')
            images.append(src) 
        display={} 
        for i in range(10): 
            try: 
                display[results[i]]=images[i]
            except: 
                pass
        print(display)
        d.close() 
        print(results)
        print(images) 
        return render(request, 'results.html', {'display':display})

    return render(request, 'home.html', {'form': search_form})
Example #32
    for filename in glob.glob(os.path.join(dataDirectory, '*.docx')):
        print(filename)
        filenames.append(filename)

    for filename in filenames:
        print("Reading file " + str(fileIndex) + " of " + str(len(filenames)))
        fileIndex += 1

        totalDescription += getText(filename)

    r = Rake()

    print("Extracting keywords...")

    r.extract_keywords_from_text(totalDescription)

    print("Getting ranked phrases")

    keywords = r.get_ranked_phrases_with_scores()

    df = pd.DataFrame(columns=['rank', 'keyword_set'])

    for pair in keywords:
        num = (len(df) + 1)
        df.loc[num] = pair

    dirtitle = 'KeywordExtraction.csv'

    df.to_csv(dirtitle, encoding='utf-8')
Example #33
    'arabic.tagger',
    'stanford-postagger-full-2018-10-16/stanford-postagger.jar')
for tag in tagger.tag(text.split()):
    print(tag[1])

parser = SParse.StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz')
sentences = parser.raw_parse_sents(text.split('.'))
for line in sentences:
    for sentence in line:
        print(sentence)
        sentence.draw()

ner = Text(text)
for sent in ner.sentences:
    print(sent)
    for entity in sent.entities:
        print(entity.tag, entity)
    print('')

with open('ar_london.txt', encoding='utf-8') as f:
    london = f.read()
print(london[:100])

rake = Rake(stopwords=stopwords.words('arabic'),
            punctuations=',./:،؛":.,’\'',  # punctuation characters to ignore
            language='arabic',
            max_length=15)
rake.extract_keywords_from_text(london)
for phrase in rake.get_ranked_phrases()[:5]:
    print(phrase)