def get_keywords(raw_text, stopwords_file):
    """Tokenize *raw_text* with punctuation stripped and score keywords via RAKE.

    :param raw_text: free text to extract keywords from
    :param stopwords_file: path to a RAKE stop-word list
    :return: whatever ``rake.Rake.run`` yields (scored keyword candidates)
    """
    # Remove every punctuation character before tokenizing.
    punctuation = set(string.punctuation)
    cleaned = ''.join(c for c in raw_text if c not in punctuation)
    # RAKE is fed the tokens re-joined with tabs, matching the original call.
    tab_joined = '\t'.join(nltk.word_tokenize(cleaned))
    return rake.Rake(stopwords_file).run(tab_joined)
def gettags():
    """Extract up to three single-word RAKE keywords from the ``text`` query arg.

    Reads ``text`` from the current request, strips quoting artifacts and
    HTML-ish delimiters, runs RAKE with ``smartstoplist.txt``, and returns a
    space-separated string of at most three single-word tags.
    """
    text = request.args.get('text')
    # Strip URL-encoded and literal quotes.
    text = text.replace("%22", "")
    text = text.replace('"', "")
    # Turn markup delimiters into spaces. Each replace is idempotent, so the
    # original's duplicated "<" / ">" calls were redundant and are removed.
    for delim in ("<", "/", "\n", ">"):
        text = text.replace(delim, " ")
    # NOTE(review): this call is a no-op as written; it likely was meant to
    # collapse double spaces ("  " -> " ") — confirm against the original file.
    text = text.replace(" ", " ")
    print(text)

    rake_object = rake.Rake("smartstoplist.txt")
    keywords = rake_object.run(text)
    print(keywords)

    # Keep only single-word keywords (phrase contains no space).
    finalkeys = [kw for kw in keywords if ' ' not in kw[0]]

    # Top three keywords, space-separated (replaces the xrange/counter loop).
    tags = " ".join(kw[0] for kw in finalkeys[:3]).strip()
    print(tags)
    return tags
def generate_word_list(self):
    """Map every candidate phrase to its constituent words in self.word_list."""
    # update() keeps any pre-existing entries, exactly like the original
    # per-phrase assignment loop did.
    self.word_list.update(
        (phrase, RAKE.separate_words(phrase, 0))
        for phrase in self.phrase_list
    )
def run(self, text):
    """Run the full RAKE pipeline over *text*.

    Pre-processes (lemmatizes) the input, splits it into sentences, builds
    candidate phrases, scores the words, and returns the candidates as a
    list of (keyword, score) tuples sorted by descending score.
    """
    lemmas = self.pre_process(text)
    sentences = RAKE.split_sentences(lemmas)
    self.phrase_list = RAKE.generate_candidate_keywords(
        sentences, self.stop_words_pattern)
    self.generate_word_list()
    scores = self.calculate_word_scores()
    candidates = self.generate_candidate_word_scores(scores)
    # Highest-scoring keywords first.
    return sorted(candidates.items(), key=lambda pair: pair[1], reverse=True)
def classify_text(text):
    """Classify *text* into domains by matching RAKE keywords against train.csv.

    Runs RAKE over *text*, keeps positively scored keywords, and matches each
    keyword to the most cosine-similar training word in ``train.csv`` (rows of
    ``word,domain``). RAKE scores are accumulated per matched domain.

    :return: ``(domainlist, score)`` — parallel lists of domains seen and the
             accumulated keyword score for each.
    """
    rake_object = rake.Rake("SmartStoplist.txt")
    keywords = rake_object.run(text)

    # Keep only keywords with a positive RAKE score.
    words = []
    scr = []
    for phrase, value in keywords:
        if value > 0:
            words.append(phrase)
            scr.append(value)

    # Load (word, domain) training pairs; 'with' closes the file reliably.
    simpletext = []
    with open('train.csv', 'r') as simple:
        for row in csv.reader(simple):
            simpletext.append((row[0], row[1]))

    domainlist = []
    score = []
    # enumerate() gives the true score index even if a keyword repeats
    # (the original words.index(word) always hit the first occurrence).
    for i, word in enumerate(words):
        # Find the training word with the highest cosine similarity.
        # Fixes two defects in the original: it shadowed the builtin `max`,
        # and when no candidate scored > 0 it reused a stale `domain` from a
        # previous iteration (NameError on the very first word).
        best_sim = 0
        best_domain = None
        v1 = word2vec(word)
        for train_word, train_domain in simpletext:
            res = cosdis(v1, word2vec(train_word))
            if res > best_sim:
                best_sim = res
                best_domain = train_domain
        if best_domain is None:
            continue  # no similar training word — skip this keyword
        print(word + "----" + best_domain)
        if best_domain in domainlist:
            score[domainlist.index(best_domain)] += scr[i]
        else:
            domainlist.append(best_domain)
            score.append(scr[i])

    print(domainlist)
    return domainlist, score
def __init__(self, stop_words):
    """
    :param stop_words: Stop Words List [Array]
    """
    # Rake stores its compiled pattern in a name-mangled (double-underscore)
    # attribute, so the stop-word regex is rebuilt here for our own use.
    self.stop_words_pattern = RAKE.build_stop_word_regex(stop_words)
    self.word_list = {}
    self.phrase_list = []
    # Let the parent Rake class finish its own setup last, as the
    # original did, so any attributes it sets are not clobbered.
    super(DRAKE, self).__init__(stop_words)
def __init__(self):
    """Load the RAKE stop-word list and the lowercased exception rows."""
    # Resolve the package's ../data directory once instead of rebuilding
    # the dirname(abspath(__file__)) expression for every path.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    self.rake = RAKE.Rake(base_dir + '/../data/stopwords.csv')
    self.exceptions = []
    with open(base_dir + '/../data/exceptions.csv', mode='r') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            # Store every exception row fully lowercased.
            self.exceptions.append([cell.lower() for cell in row])
def _rake_analysis(self, document, stop_words):
    """Score *document*'s keywords with RAKE using the given stop-word list."""
    # One-shot Rake instance; run() returns the scored keyword candidates.
    return rake.Rake(stop_words).run(document)
# Demo / scratch script: extract single-word RAKE keywords from sample
# StackOverflow post bodies.
import RAKE.RAKE as rake import operator

# NOTE(review): unescaped backslashes in this Windows path ("\U", "\D", "\s")
# make it a SyntaxError on Python 3 — it should be a raw string r"C:\...".
rake_object = rake.Rake( "C:\Users\Harshit Agarwal\Desktop\stackoverflow.com-Posts\smartstoplist.txt" )

# Sample corpus: an NLP course description plus two StackOverflow posts.
text2 = "Natural language processing (NLP) deals with the application of computational models to text or speech data. Application areas within NLP include automatic (machine) translation between languages; dialogue systems, which allow a human to interact with a machine using natural language; and information extraction, where the goal is to transform unstructured text into structured (database) representations that can be searched and browsed in flexible ways. NLP technologies are having a dramatic impact on the way people interact with computers, on the way people interact with each other through the use of language, and on the way people access the vast amount of linguistic data now in electronic form. From a scientific viewpoint, NLP involves fundamental questions of how to structure formal models (for example statistical models) of natural language phenomena, and of how to design algorithms that implement these models. In this course you will study mathematical and computational models of language, and the application of these models to key problems in natural language processing. The course has a focus on machine learning methods, which are widely used in modern NLP systems: we will cover formalisms such as hidden Markov models, probabilistic context-free grammars, log-linear models, and statistical models for machine translation. The curriculum closely follows a course currently taught by Professor Collins at Columbia University, and previously taught at MIT."

# NOTE(review): the next two string literals contain a raw line break (likely
# a paste/extraction artifact); as written they are invalid Python and need
# the break removed or the literal converted to a triple-quoted string.
text = "<p>How can I monitor an SQL Server database for changes to a table without using triggers or modifying the structure of the database in any way? 
My preferred programming environment is <a href=\"http://en.wikipedia.org/wiki/.NET_Framework\">.NET</a> and C#.</p>\n\n<p>I'd like to be able to support any <a href=\"http://en.wikipedia.org/wiki/Microsoft_SQL_Server#Genesis\">SQL Server 2000</a> SP4 or newer. My application is a bolt-on data visualization for another company's product. Our customer base is in the thousands, so I don't want to have to put in requirements that we modify the third-party vendor's table at every installation.</p>\n\n<p>By <em>\"changes to a table\"</em> I mean changes to table data, not changes to table structure.</p>\n\n<p>Ultimately, I would like the change to trigger an event in my application, instead of having to check for changes at an interval.</p>\n\n<hr>\n\n<p>The best course of action given my requirements (no triggers or schema modification, SQL Server 2000 and 2005) seems to be to use the BINARY_CHECKSUM function in <a href=\"http://en.wikipedia.org/wiki/Transact-SQL\">T-SQL</a>. The way I plan to implement is this:</p>\n\n<p>Every X seconds run the following query:</p>\n\n<pre><code>SELECT CHECKSUM_AGG(BINARY_CHECKSUM(*)) FROM sample_table WITH (NOLOCK);\n</code></pre>\n\n<p>and compare that against the stored value. If the value has changed, go through the table row by row using the query</p>\n\n<pre><code>select row_id,BINARY_CHECKSUM(*) from sample_table WITH (NOLOCK);\n</code></pre>\n\n<p>and compare the returned checksums against stored values.</p>\n"
text3 = "<p>Let's say I have a <code>DataTable</code> with a <code>Name</code> column. I want to have a collection of the unique names ordered alphabetically. 
The following query ignores the order by clause.</p>\n\n<pre><code>var names =\n (from DataRow dr in dataTable.Rows\n orderby (string)dr[\"Name\"]\n select (string)dr[\"Name\"]).Distinct();\n</code></pre>\n\n<p>Why does the <code>orderby</code> not get enforced?</p>\n"

# Turn markup delimiters into spaces before keyword extraction. The repeated
# "<" / ">" replaces are redundant (replace is idempotent), and the
# single-space replace is a no-op — presumably meant to collapse "  ".
text = text.replace("<", " ") text = text.replace("/", " ") text = text.replace("\n", " ") text = text.replace(">", " ") text = text.replace(" ", " ") text = text.replace(">", " ") text = text.replace("<", " ")

# Keep only single-word keywords (no embedded space in the phrase).
finalkeys = [] keywords = rake_object.run(text) for a in keywords: if ' ' not in a[0]: finalkeys.append(a) print "keywords: ", finalkeys
# Batch script: read StackOverflow posts from a JSON dump, strip code blocks
# and HTML, and (presumably) run RAKE over each post body.
# NOTE(review): this block appears truncated at the chunk boundary — the
# per-post loop continues past the last visible statement.
import RAKE.RAKE as rake
import operator
import json
import re

# Output / input handles. NOTE(review): f1 and f4 are opened but never closed
# or written in the visible portion — likely used further down.
f1 = open('extratest/resultantPython.json','w')
f = open('extratest/python10000.json','r')
rake_object = rake.Rake("smartstoplist.txt")
parsed_input = json.load(f)

# Build a stop-word lookup set (dict used as a set); line[0:-1] drops the
# trailing newline of each stop-word line.
stopwords = {}
f3 = open("smartstoplist.txt","r")
f4 = open("extratest/fullfilePython.txt","w")
for line in f3:
    stopwords[line[0:-1]]= 1
#print stopwords
#print parsed_input

# Iterate over every post record in the dump.
results = parsed_input['results']
for elem in results:
    #print elem
    aa = elem
    source = aa['_source']
    b = source
    body = b['@Body']
    tags = b['@Tags']
    # Tags arrive as "<tag1><tag2>..."; strip the angle brackets and split.
    tags = tags[1:].replace(">","")
    tags = tags.replace("<"," ")
    tags = tags.split()
    text = body
    # Drop code blocks entirely, then strip all remaining HTML tags.
    text = re.sub('(<pre><code>)((.|\n)*?)(<\/code><\/pre>)','',text)
    text = re.sub('<.*?>', '', text)
    #print text
    text = text.replace("<"," ")