def __init__(self, words, is_binary=False):
    self._keywords = words
    self._stemmed_keywords = []
    stemmer = PorterStemmer()
    for word in words:
        self._stemmed_keywords.append(stemmer.stem_word(word))
    self._is_binary = is_binary
def extract(self, line):
    """ find word pairs that co-occur and extract
    # of minimum distance word pairs in the line """
    words = self.tokenize(line.lower())
    count = 0.0
    stemmer = PorterStemmer()
    bad_indices = []
    you_indices = []
    for i in range(len(words)):
        word = words[i]
        if word in self._youwords:
            you_indices.append(i)
        word = stemmer.stem_word(word)
        if word in self._stemmed_badwords or self.isWordPartOf(word, self._badwords):
            bad_indices.append(i)
    if not bad_indices or not you_indices:
        return [-1]
    else:
        distances = []
        for bindex in bad_indices:
            for yindex in you_indices:
                distances.append(abs(bindex - yindex))
        mn = min(distances)
        count = sum([1 for d in distances if d == mn])
        # return [(count * 1.0) * mn / len(line)]
        return [1]
def get_stemmed_terms_list(doc, stem_words_map=None, stem_bigrams_map=None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []
    clean_doc = [(w.strip()).lower() for w in doc.split() if len(w) in range(3, 16)]
    filtered_words = [
        w.strip('.,;?!:)(#') for w in clean_doc
        if not w.strip('.,;?!:)(#') in stopwords.words('english')
    ]
    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem_word(w)
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0) + 1
            local_map[w_temp] = w
            word_list.append(w_temp)
    bigrams = nltk.bigrams(word_list)
    for b in bigrams:
        bigram_org = (local_map[b[0]], local_map[b[1]])
        if stem_bigrams_map is not None:
            if b not in stem_bigrams_map:
                stem_bigrams_map[b] = dict()
            stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(bigram_org, 0) + 1
    return word_list, bigrams
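# Illustrative call (not part of the original source): assumes an NLTK version
# old enough to provide PorterStemmer.stem_word (newer releases only expose
# stem()) and that the imports below are the ones the function above relies on.
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stem_map, bigram_map = {}, {}
terms, bigrams = get_stemmed_terms_list(
    "Running quickly through the running trails", stem_map, bigram_map)
# terms -> ['run', 'quickli', 'run', 'trail']; stem_map maps each stem back to
# its surface forms with counts, e.g. stem_map['run'] == {'running': 2}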
def __init__(self, badwords, youwords):
    self._badwords = badwords
    self._stemmed_badwords = []
    self._youwords = youwords
    self._part_of_badword = {}  # cache of words that start or end with offensive content
    stemmer = PorterStemmer()
    for word in badwords:
        self._stemmed_badwords.append(stemmer.stem_word(word))
def _getFeatures(self, corpus):
    stemmer = PorterStemmer()
    tokens = corpus.split(" ")
    features = filter(lambda x: len(x) > 1, tokens)
    finalList = []
    for feature in features:
        feature = re.sub("[^a-zA-Z0-9']", "", feature.lower())
        finalList.append(stemmer.stem_word(feature))
    return finalList
def __init__(self, wordlist1, wordlist2, mindist=1, maxdist=100):
    self._wordlist = []
    stemmer = PorterStemmer()
    self._mindistance = mindist
    self._maxdistance = maxdist
    for word1 in wordlist1:
        for word2 in wordlist2:
            word1 = stemmer.stem_word(word1)
            self._wordlist.append(word1 + word2)
            self._wordlist.append(word1 + "-" + word2)
            self._wordlist.append(word1)
def extract(self, line):
    words = self.tokenize(line.lower())
    count = 0.0
    stemmer = PorterStemmer()
    for word in words:
        word = stemmer.stem_word(word)
        if word in self._stemmed_keywords:
            count += 1
    if self._is_binary:
        return [1] if count > 0 else [0]
    else:
        return [count]
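# Hypothetical usage sketch (not part of the original source): this extract()
# and the first __init__ above appear to belong to the same keyword-count
# feature extractor; the class name KeywordFeature and a whitespace-based
# tokenize() are assumptions made only for illustration.
# feature = KeywordFeature(['insult', 'stupid'], is_binary=False)
# feature.extract("you are so stupid")   # -> [1.0]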
def extract(self, line):
    """ find word pairs that co-occur and extract
    # of minimum distance word pairs in the line """
    words = self.tokenize(line.lower())
    stemmer = PorterStemmer()
    for i in range(len(words)):
        word = stemmer.stem_word(words[i])
        if word in self._wordlist:
            return [1.0]
    return [0.0]
def getFeatures(self, corpus):
    stemmer = PorterStemmer()
    stems = FreqDist()
    onlyLettersNumbers = re.compile('[^a-zA-Z0-9%!]')
    corpus = onlyLettersNumbers.sub(' ', corpus.lower())
    corpus = TreebankWordTokenizer().tokenize(corpus)
    count = 0
    for word in corpus:
        if not stopwords.STOP_WORDS.get(word) and len(word.strip()) > 1:
            stems.inc(stemmer.stem_word(word))
            count += 1
        if self.__maxFeatures > 0 and count >= self.__maxFeatures:
            break
    features = stems.samples()
    return features
from nltk.stem.porter import PorterStemmer
import re

NORMALIZE_TERM_REG = "[^a-zA-Z0-9 ]"

porter = PorterStemmer()

lower_term = lambda term: term.lower()
terms_filter = lambda term: term is not None and len(term.strip()) != 0
normalize = lambda term: re.sub(NORMALIZE_TERM_REG, "", term)
remove_space = lambda term: term.strip()
stem_terms = lambda term: porter.stem_word(term)


def split_and_normalize_terms(terms, pattern):
    new_terms = filter(terms_filter, terms)
    new_terms = split_terms(new_terms, pattern)
    new_terms = map(normalize, new_terms)
    new_terms = filter(terms_filter, new_terms)
    new_terms = map(remove_space, new_terms)
    return new_terms


def filter_and_normalize_terms(terms):
    new_terms = filter(terms_filter, terms)
    new_terms = map(normalize, new_terms)
    new_terms = filter(terms_filter, new_terms)
    return new_terms


def split_terms(terms, pattern):
    new_terms = []
    for term in terms:
        splited_terms = re.split(pattern, term)
        new_terms.extend(splited_terms)
    return new_terms
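# Illustrative call (not part of the original source), assuming Python 2
# semantics where filter/map return plain lists:
terms = ["data-mining, NLP", None, "   "]
print split_and_normalize_terms(terms, r"[,\s]+")   # -> ['datamining', 'NLP']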
class Merger:

    SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
    EXA = Namespace('http://www.example.com/#')

    def __init__(self, files, verbose):
        """Create a new instance.

        init takes two parameters, **files** and **verbose**.
        **files** is a list of files that will become global to this class.
        **verbose** sets the level of logging. If set to none, the loglevel
        is ``warning`` and will never be called. Debug will set the
        loggerlevel to ``debug`` - this would get you a lot of information.
        File, screen and both will set the level to info and tell the logger
        where the output should go. The logfile is called ``parserLog.txt``.
        init also creates a resultgraph that contains all thesauri and their
        mappings.

        :param files: list of thesauri
        :type files: list
        :param verbose: A string of (screen, file, both, none, debug)
        :type verbose: string
        """
        if verbose not in ('screen', 'file', 'both', 'none', 'debug'):
            sys.exit('Value of verbose must be screen, file, both, debug'
                     ' or none')
        # Create a logging instance and
        # enable different levels based on verbose
        self.logger = logging.getLogger()
        if verbose == 'none':
            self.logger.setLevel(logging.WARNING)
        elif verbose == 'debug':
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.INFO)
        self.formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        self.log(output=verbose)
        self.logger.debug('Logger created')
        self.logger.debug('Initializing global variables')
        # self.files = files  # List of input files
        self.porter = PorterStemmer()
        self.result = ConjunctiveGraph()  # Where it ends
        self.graphlist = self.parseGraphs(files)
        self.mergeFiles(self.graphlist)
        self.logger.debug('Got %s inputfiles' % (len(files)))
        # self.graph = {}  # contains the parsed input files
        self.logger.info('Merger initiated')
        self.addContext()
        self.reporting = {}
        self.reporting['equals'] = 0
        self.reporting['substrings'] = 0
        self.reporting['phrase'] = 0
        self.reporting['related'] = 0

    def log(self, output='both'):
        """This method creates a filehandle and a screenhandle for the logger.

        Depending on the output variable, it will call logToFile to create
        the filehandle, logToScreen to create the screenhandle, or both.
        If output is debug, log creates both handles as well. If output is
        none, nothing happens.
        Expected parameter: file, screen, both, debug, none
        """

        def logToFile():
            """Create a filehandle for logging. The file is called
            parserLog.txt. Loglevel is set to debug, so both info and debug
            will be written.
            TODO: User gets to decide filename and location
            """
            fh = logging.FileHandler('parserLog.txt')
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(self.formatter)
            self.logger.addHandler(fh)

        def logToScreen():
            """Create a screenhandle for logging. All logs get written onto
            screen. Loglevel is set to debug, so both info and debug will be
            written.
            TODO: redundant?
            """
            scr = logging.StreamHandler()  # Print on screen
            scr.setLevel(logging.DEBUG)
            scr.setFormatter(self.formatter)
            self.logger.addHandler(scr)

        if output in ('both', 'debug'):
            logToFile()
            logToScreen()
        if output == 'file':
            logToFile()
        if output == 'screen':
            logToScreen()

    def parseGraphs(self, files):
        """This method takes the input file names and parses them.

        return: graphs - a list of ConjunctiveGraphs
        """
        self.logger.info('Parsing the input files: %s' % (files))
        graphs = []
        for i in range(len(files)):
            graphs.append(ConjunctiveGraph('IOMemory'))
        for i in range(len(files)):
            graphs[i].parse(files[i], format='xml')
        self.logger.debug('Graphlist created. Length is %s' % (len(graphs)))
        return graphs

    def mergeFiles(self, graphs):
        """Calls the addContent method on each graph in the graph list.
        This will write all graphs into one graph (the resultgraph).
        """
        self.logger.info('Merging inputfiles into resultgraph...')
        for i in range(len(graphs)):
            self.addContent(graphs[i])

    def addContext(self):
        """Adding the namespacebinding for SKOS and a custom namespace EXA.
        TODO: Own method for custom NS binding?
        """
        self.result.bind('skos', Merger.SKOS)
        self.result.bind('exa', Merger.EXA)
        self.addToResult(Merger.EXA.distantMatch, RDF.type, OWL.ObjectProperty)

    def getLabels(self, graph):
        """getLabels takes a ConjunctiveGraph instance and finds all
        SKOS:prefLabel and rdfs:label. It returns a dictionary with
        {uri: label}.

        param: graph - a ConjunctiveGraph instance
        return: compdict - a dictionary of all {uri: label} for a graph
        """
        compdict = {}
        self.logger.info('Getting labels from %s' % (graph))
        if (None, Merger.SKOS.prefLabel, None) in graph:
            for uri, label in graph.subject_objects(Merger.SKOS.prefLabel):
                compdict[uri] = label.toPython().strip().lower()
        if (None, URIRef("http://www.w3.org/2000/01/rdf-schema#label"),
                None) in graph:
            for uri, label in graph.subject_objects(
                    URIRef("http://www.w3.org/2000/01/rdf-schema#label")):
                compdict[uri] = label.toPython().strip().lower()
        return compdict

    def removeDiacritics(self, label):
        """This method uses unicodedata to remove diacritics from a string.
        TODO: Does this work without unicodedata?

        param: string
        return: string
        """
        label = ''.join((c for c in unicodedata.normalize('NFD', unicode(label))
                         if unicodedata.category(c) != 'Mn'))
        return label

    def removePunctuation(self, label):
        """This method removes punctuation. Right now, it will remove
        '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

        param: string
        return: string
        """
        for punct in string.punctuation:
            label = label.replace(punct, " ")
        return label

    def stemWords(self, label):
        """This method stems a single word or a phrase.

        param: string
        return: string
        """
        label = " ".join(self.porter.stem_word(word)
                         for word in label.split(" "))
        return label

    def __broaders(self, uri, graph):
        """This "private" method is a generator over broaderTerms of a given
        URI in a given ConjunctiveGraph. Use getParents() to obtain the list
        of broaderTerms.
        """
        for n in graph.transitive_objects(URIRef(uri),
                URIRef('http://www.w3.org/2004/02/skos/core#broader')):
            if uri == n:
                continue
            yield n

    def __narrowers(self, uri, graph):
        """This "private" method is a generator over narrowerTerms of a given
        URI in a given ConjunctiveGraph. Use getChildren() to obtain the list
        of narrowerTerms.
        """
        for n in graph.transitive_objects(URIRef(uri),
                URIRef('http://www.w3.org/2004/02/skos/core#narrower')):
            if uri == n:
                continue
            yield n

    def getParents(self, uri, graph):
        list = []
        for n in self.__broaders(uri, graph):
            for label in graph.objects(n,
                    URIRef("http://www.w3.org/2004/02/skos/core#prefLabel")):
                list.append(label.toPython().strip().lower())
        return list

    def getChildren(self, uri, graph):
        list = []
        for n in self.__narrowers(uri, graph):
            for label in graph.objects(n,
                    URIRef("http://www.w3.org/2004/02/skos/core#prefLabel")):
                list.append(label.toPython().strip().lower())
        return list

    def isSameTerm(self, label1, label2):
        """returns true if two items are equal

        params: string
        """
        if label1 == label2:
            self.logger.debug('Identical terms found %s - %s'
                              % (label1, label2))
            return True

    def isSubstring(self, label1, label2):
        """returns true if one item is a substring of the other.

        params: string
        """
        if len(label1.split(' ')) == 1 and len(label2.split(' ')) == 1:
            # if both split-operations have returned
            # one word (were not successful)
            if label1 in label2 or label2 in label1:
                self.logger.debug('Substring found for %s and %s'
                                  % (label1, label2))
                return True

    def isPhrase(self, label1, label2):
        """returns true if one item is a phrase containing the other item.

        params: string
        """
        if len(label1.split(' ')) > 1 or len(label2.split(' ')) > 1:
            # if one of the lists holds more than one word (is a phrase)
            for word in label1.split(' '):
                for word2 in label2.split(' '):
                    if word == word2:
                        return True

    def termSignatur(self, label):
        """This method returns the term signature of a phrase.
        It stems and sorts the phrase.

        param: string
        return: string
        """
        return ''.join(sorted(self.porter.stem_word(word)
                              for word in label.split(" ")))

    def isSameSig(self, label1, label2):
        if len(label1.split(' ')) > 1 and len(label2.split(' ')) > 1:
            label1 = self.termSignatur(label1)
            label2 = self.termSignatur(label2)
            if label1 == label2:
                return True

    def isConceptScheme(self, uri):
        """returns true if a uri is a SKOS:ConceptScheme
        TODO: Method for custom "type"-checking
        """
        i = 0
        for triple in self.result.triples((uri, RDF.type,
                                           Merger.SKOS.ConceptScheme)):
            i += 1
        if i > 0:
            self.logger.debug('Is a ConceptScheme: %s' % (uri))
            return True
        return False

    def addEquals(self, uri1, uri2):
        """addEquals takes two URIs and adds them to the resultgraph.
        If one of the URIs is a SKOS:ConceptScheme, the predicate will be
        EXA:distantMatch instead of SKOS:closeMatch.
        """
        self.reporting['equals'] += 1
        if self.isConceptScheme(uri1) or self.isConceptScheme(uri2):
            self.logger.debug('Adding equal Concept - ConceptScheme relation')
            self.addToResult(uri1, Merger.EXA.distantMatch, uri2)
        else:
            self.logger.debug('Adding equal terms')
            self.addToResult(uri1, Merger.SKOS.closeMatch, uri2)

    def addRelated(self, uri1, uri2):
        """addRelated takes two URIs and adds them to the resultgraph.
        If one of the URIs is a SKOS:ConceptScheme, the predicate will be
        EXA:distantMatch instead of SKOS:semanticRelation.
        """
        self.reporting['related'] += 1
        if self.isConceptScheme(uri1) or self.isConceptScheme(uri2):
            self.logger.debug('Adding related Concept - ConceptScheme relation')
            self.addToResult(uri1, Merger.EXA.distantMatch, uri2)
        else:
            self.logger.debug('Adding related terms')
            self.addToResult(uri1, Merger.SKOS.semanticRelation, uri2)

    def addSubstrings(self, uri1, uri2):
        """addSubstrings takes two URIs and adds them to the resultgraph.
        If one of the URIs is a SKOS:ConceptScheme, the predicate will be
        EXA:distantMatch instead of SKOS:relatedMatch.
        """
        self.reporting['substrings'] += 1
        if self.isConceptScheme(uri1) or self.isConceptScheme(uri2):
            self.logger.debug('Adding substr Concept - ConceptScheme relation')
            self.addToResult(uri1, Merger.EXA.distantMatch, uri2)
        else:
            self.logger.debug('Adding substrings')
            self.addToResult(uri1, Merger.SKOS.relatedMatch, uri2)

    def addPhrase(self, uri1, uri2):
        """addPhrase takes two URIs and adds them to the resultgraph.
        If one of the URIs is a SKOS:ConceptScheme, the predicate will be
        EXA:distantMatch instead of SKOS:relatedMatch.
        """
        self.reporting['phrase'] += 1
        if self.isConceptScheme(uri1) or self.isConceptScheme(uri2):
            self.addToResult(uri1, Merger.EXA.distantMatch, uri2)
            self.logger.debug('Adding phrase Concept - ConceptScheme relation')
        else:
            self.logger.debug('Adding phrase')
            self.addToResult(uri1, Merger.SKOS.relatedMatch, uri2)

    def addToResult(self, s, p, o):
        """Adds a triple to the resultgraph"""
        self.result.add((s, p, o))

    def writeToFile(self, dest, ext):
        """Serializes the resultgraph into a file
        TODO: custom filename and format
        """
        self.logger.info('Writing output...')
        self.result.serialize(destination=dest, format=ext)
        self.logger.info('Done.')

    def addContent(self, graph):
        """Adds all triples from a graph to the resultgraph"""
        for s, p, o in graph:
            self.addToResult(s, p, o)
class DateExtractor(object):
    """Module for extracting date expressions from text.

    Examples of recognized dates:
        - 10 days
        - ten days
        - 11/10/2106
        - 12 months
        - one year.
    """

    GRAMMAR = r"""
    DATE: {<JJ|NN|NNP|CD|VB><\(><CD><\)><VBG|NN><NNS|NN|JJ>}  # thirty (30) working|business days
          {<JJ|NN|NNP|CD|VB><\(><CD><\)><NNS|NN|JJ>}          # thirty (30) days
          {<CD><NN><NNS|NN>}                                  # thirty business days
          {<CD><NN|NNS|JJ>}                                   # 10 days, 1 year
    """

    def __init__(self):
        super(DateExtractor, self).__init__()
        self.parser = nltk.RegexpParser(self.GRAMMAR)
        self.stemmer = PorterStemmer()

    def extract_dates(self, text):
        sentences = self._get_sentences(text)
        tagged_sentences = [nltk.pos_tag(sent) for sent in sentences]
        result = []
        for sentence in tagged_sentences:
            tree = self.parser.parse(sentence)
            for expression in self._extract_data_from_tree(tree):
                if not expression or self._is_false_positive(expression, sentence):
                    continue
                expression = self._extend_to_left(expression, sentence)
                result.append(expression)
        return result

    def _extend_to_left(self, expression, tagged_sentence):
        """Try to complete the first numeral.

        It's a healing method for the next scenario: seventy two (72) days.
        """
        num_text = self._extract_number_from_expression(expression)
        if not num_text:
            return expression
        if expression.startswith(num_text):
            # Expression is correctly formatted
            return expression
        # If the text would have contained '-' the expression would have been correct.
        num_words = num_text.split('-')
        if not expression.lower().startswith(num_words[-1]):
            print 'Expression %s does not start with the expected word %s' % (expression, num_words[-1])
            return expression
        sentence = join_sentence([t[0] for t in tagged_sentence])
        # Last word is in the expression, the other words we expect to find in the
        # subsentence. The fact that there can be more such expressions in a sentence
        # was considered, but we should not do anything special for them because
        # agreements have a pseudo structure which leads to consistency in terms of
        # formats.
        idx = sentence.find(expression)
        subsentence = sentence[:idx].strip().split()
        subsentence.reverse()
        num_words = num_words[:-1]
        num_words.reverse()
        idx = 0
        wc = len(num_words)
        while idx < wc and num_words[idx] == subsentence[idx]:
            expression = '%s %s' % (num_words[idx], expression)
            idx += 1
        return expression

    def _extract_number_from_expression(self, expression):
        number_search = re.search('\(([0-9]+)\)', expression)
        if not number_search:
            return None
        try:
            number = int(number_search.groups()[0])
        except:
            print "Could not extract number from expression: %s" % expression
            return None
        return num2words(number)

    def _extract_data_from_tree(self, tree):
        expressions = []
        for subtree in tree.subtrees():
            if not subtree.label() == 'DATE':
                continue
            expressions.append(join_sentence([t[0] for t in subtree.leaves()]))
        return expressions

    def _get_sentences(self, text):
        sentences = nltk.sent_tokenize(text)
        # Remove new lines
        sentences = [s.replace('\r\n', ' ') for s in sentences]
        # Collapse whitespaces
        rex = re.compile(r'[ \t]+')
        sentences = [rex.sub(' ', s) for s in sentences]
        sentences = [nltk.word_tokenize(sent) for sent in sentences]
        return sentences

    def _is_false_positive(self, expression, tagged_sentence):
        # The last token should be either time unit or 'period'
        time_unit = expression.split()[-1]
        stem = self.stemmer.stem_word(time_unit)
        if stem not in ALLOWED_STEMS:
            return True
        if stem == YEAR_STEM:
            # Check if the sentence represents an age expression (it is followed by
            # "of age" or "old").
            sentence = join_sentence([t[0] for t in tagged_sentence])
            idx = sentence.find(expression) + len(expression)
            subsentence = sentence[idx:].strip()
            # Note: This check may fail if in the same sentence there are both age
            # expressions and date expressions in years. This should not be a problem
            # since no reviewed document has this case (it also doesn't make sense in
            # the pseudo-structure of agreements).
            if any(subsentence.startswith(expr) for expr in AGE_EXPRESSIONS):
                return True
        return False
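# Hypothetical usage sketch (not part of the original source): assumes the
# surrounding module already defines join_sentence, ALLOWED_STEMS, YEAR_STEM
# and AGE_EXPRESSIONS and imports nltk, re and num2words as the class above
# expects, plus an NLTK old enough to expose PorterStemmer.stem_word.
# extractor = DateExtractor()
# dates = extractor.extract_dates("Payment is due within thirty (30) business days.")
# # dates is expected to hold the recognized duration phrase(s), e.g. something
# # like "thirty (30) business days", depending on the POS tags NLTK assigns.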
class BooleanSearch(object):
    """
    This class handles the parsing and execution of queries from a provided
    query file based on the index and postings file provided upon
    initialization. Results are saved into an output file.

    Parsing of queries is handled by converting the boolean expressions into
    an equivalent python expression that is executable.
    """

    eval_index_local = "index"
    eval_globals = {"__builtins__": None}
    replacements = [("AND NOT", "-"), ("AND", "&"), ("OR", "|"), ("NOT", "~"),
                    ("(", " ( "), (")", " ) ")]
    exprs = set(["-", "&", "|", "~", "(", ")"])
    expr_postings_ref = "index[\"%s\"]"

    def __init__(self, index_filename, postings_filename):
        """
        index_filename refers to the dictionary file.
        postings_filename refers to the postings file.
        """
        self.stemmer = PorterStemmer()
        self.index = Index(index_filename, postings_filename)
        self.eval_locals = {self.eval_index_local: self.index}

    def _to_python_expression(self, query):
        """
        Parses a boolean expression by converting the boolean operator keywords
        into python's bitwise operators, and converts the terms into their
        respective index calls that return SkipList objects. The resulting
        expression is an executable python expression.

        WARNING: NOT SAFE FOR PRODUCTION SYSTEMS. FOR ACADEMIC PURPOSES ONLY.
        """
        query = reduce(lambda q, args: q.replace(*args), self.replacements, query)
        query_list = [x not in self.exprs
                      and self.expr_postings_ref % self.stemmer.stem_word(x.lower())
                      or x
                      for x in query.split()]
        return " ".join(query_list)

    def _execute_query(self, query):
        """
        Executes the provided query and returns the result.

        WARNING: NOT SAFE FOR PRODUCTION SYSTEMS. FOR ACADEMIC PURPOSES ONLY.
        """
        expression = self._to_python_expression(query)
        try:
            result = eval(expression, self.eval_globals, self.eval_locals)
        except SyntaxError as se:
            return ("Syntax Error occurred, possible malformed expression "
                    "during conversion: %s" % expression)
        except NameError as ne:
            return ("Name Error occurred, possible invalid object reference "
                    "in query: %s" % expression)
        else:
            return result

    def process_queries(self, query_filename, output_filename):
        """
        This method takes in a query filename and output filename.
        For every query, it writes the output into a new line.
        """
        try:
            with open(query_filename, 'r') as query_file, \
                 open(output_filename, 'w') as output_file:
                for row in query_file:
                    result = self._execute_query(row)
                    output_file.write(str(result) + "\n")
        except IOError as error:
            print "IO Error occurred while attempting to run BooleanSearch"
            sys.exit(error.args[1])
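# Illustrative walk-through (not part of the original source): with the
# replacement table above, a query such as
#     "bill OR Gates AND (vista OR XP) AND NOT mac"
# is rewritten into a Python bitwise expression over the index, roughly
#     index["bill"] | index["gate"] & ( index["vista"] | index["xp"] ) - index["mac"]
# (note "gates" stems to "gate"), which eval() then executes against the
# Index instance bound to the name "index" in eval_locals.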
if word != False:
    fullSearchURL = baseSearchURL + word
    f = urllib.urlopen(fullSearchURL)
    for line in f.read().split("\n"):
        found = False
        if "video src" in line:
            endURL = line[12:]
            endURL = endURL[:endURL.index("\"")]
            fullURL = baseURL + endURL
            print 'getting video from' + fullURL
            found = True
            break
    if found:
        urllib.urlretrieve(fullURL, word + '.mp4')
    else:
        # retry the search with the stemmed form of the original word
        p = PorterStemmer()
        word = PorterStemmer.stem_word(p, origword)
        fullSearchURL = baseSearchURL + word
        f = urllib.urlopen(fullSearchURL)
        for line in f.read().split("\n"):
            found = False
            if "video src" in line:
                endURL = line[12:]
                endURL = endURL[:endURL.index("\"")]
                fullURL = baseURL + endURL
                print 'getting video from' + fullURL
                found = True
                break
        if found:
            urllib.urlretrieve(fullURL, word + '.mp4')
def get_word_counts(solr, fq, query, num_words, field='sentence'):
    print query
    print str(time.asctime())
    start_time = time.time()
    function_start_time = start_time

    results = fetch_all(solr, fq, query, 'sentence')
    print "got " + query
    print len(results)
    print time.asctime()
    end_time = time.time()
    print "time {}".format(str(end_time - start_time))
    start_time = end_time

    print 'converting to utf8 and lowercasing'
    sentences = [result['sentence'].lower() for result in results]
    results = None
    end_time = time.time()
    print "time {}".format(str(end_time - start_time))
    start_time = end_time

    print 'calculating non_stemmed_wordcounts'
    term_counts = non_stemmed_word_count(sentences)
    if '' in term_counts:
        del term_counts['']
    print "Returned from non_stemmed_word_count"
    print time.asctime()
    end_time = time.time()
    print "time {}".format(str(end_time - start_time))
    start_time = end_time

    print "freeing sentences"
    sentences = None
    end_time = time.time()
    print "time {}".format(str(end_time - start_time))
    start_time = end_time

    print 'stemming and counting'
    stem_counts = collections.Counter()
    st = PorterStemmer()
    for term in term_counts.keys():
        stem = st.stem_word(term)
        stem_counts[stem] += term_counts[term]
    end_time = time.time()
    print "done stemming and counting"
    print "time {}".format(str(end_time - start_time))
    start_time = end_time

    print 'calculating stem to term map'
    stem_to_terms = {}
    for term in term_counts.keys():
        stem = st.stem_word(term)
        if stem not in stem_to_terms:
            stem_to_terms[stem] = []
        stem_to_terms[stem].append(term)
    print "done calculating stem to term map"
    print "time {}".format(str(end_time - start_time))

    counts = stem_counts.most_common(num_words)
    ret = []
    for stem, count in counts:
        if len(stem_to_terms[stem]) < 2:
            term = stem_to_terms[stem][0]
        else:
            # pick the most frequent surface form as the representative term
            best_count = 0
            for possible_best in stem_to_terms[stem]:
                if term_counts[possible_best] > best_count:
                    term = possible_best
                    best_count = term_counts[possible_best]
        ret.append({'stem': stem, 'term': term, 'count': count})

    end_time = time.time()
    print "total time {}".format(str(end_time - function_start_time))
    return ret
db = c['haiku']

# Open connection to Twitter's public timeline, build haikus
failed = True
while failed:
    failed = False
    try:
        with tweetstream.SampleStream(USER, PASS) as stream:
            for tweet in stream:
                if 'text' in tweet and len(tweet['text']) > 0:
                    screen_name = tweet['user']['screen_name']
                    hashes = [j for j in set([i for i in tweet['text'].split()
                                              if i.startswith('#')])]
                    # Strip out urls, punctuation, RTs, and @'s
                    tweet_stripped = urlre.sub('', tweet['text'])
                    tweet_stripped = punctre.sub('', tweet_stripped)
                    tweet_stemmed = [porter_stemmer.stem_word(i.lower())
                                     for i in tweet_stripped.split()]
                    # Keep unstemmed, stripped tweet for either storage or retweeting
                    tweet_outgoing = [i.lower() for i in tweet_stripped.split()]
                    # hack to make sure that only coherent tweets are passed through
                    temp_tweet = [i.lower() for i in tweet_stemmed
                                  if not i.lower().startswith('rt')]
                    tweet_for_topic = [i.lower() for i in tweet_stemmed
                                       if not i.lower().startswith('rt')
                                       and nsyl(i) > 0
                                       and i.lower() not in stopwords]
                    if tweet_for_topic == temp_tweet and len(tweet_for_topic) > 0:
                        print 'Iteration ' + str(counter)
                        # Assign this tweet a topic
                        docset = []
                        docset.append(' '.join(i for i in tweet_for_topic))
                        print 'Tweet: ' + docset[0]
                        (gamma, bound) = olda.update_lambda(docset)
                        counter += 1
                        if (counter % 100 == 0):
                            numpy.savetxt('lambdas/lambda-%d.dat' % counter,
                                          olda._lambda)
class QueryHandler(index_handler.IndexHandler):
    '''
    A class that handles queries upon our INDEX.
    It provides functions to deal with boolean retrieval or vector space
    model retrieval.
    '''

    def __init__(self, **kwargs):
        index_handler.IndexHandler.__init__(self, **kwargs)
        self.limit = kwargs.get('limit', 10)
        self.query = ""
        self.filters = set()
        self.known_filters = FILTERS
        self.debug = kwargs.get('debug', True)
        self.stemmer = PorterStemmer()
        self.res_cache_db = kwargs.get('res_cache_db', None)
        self.res_cache_exp = kwargs.get('res_cache_exp', 100)
        self.serializer = serializer
        self.tfidf_w = kwargs.get('tfidf_w', 0.33)
        self.title_w = kwargs.get('title_w', 0.33)
        self.posting_w = kwargs.get('posting_w', 0.33)

    def clear(self):
        self.filters = set()
        self.query = ""

    def process_query(self, query):
        ''' entry point for query processing '''
        self.clear()
        initial_query = query
        self.query = query
        if self.debug:
            print "INITIAL QUERY:", initial_query
        self.apply_filters()
        self.clean_stem_query()
        if len(self.query.split()) == 1:
            res = self.exec_single_query(self.query)
        elif "title_only" in self.filters:
            res = self.get_titles(self.query.split())
        else:
            weighted_terms = self.filter_query()
            res = self.vector_retrieval(weighted_terms)
        if res:
            external_ids = self.resolve_external_ids([i[0] for i in res])
            res = [(external_ids[i], res[i][1]) for i in xrange(len(res))]
            if self.res_cache_db:
                try:
                    self.res_cache_db.set(initial_query, self.serializer.dumps(res))
                except:
                    raise Exception, "CACHING SEARCH RESULT FAILED, UNREACHABLE DB"
        return res

    def apply_filters(self):
        for i in self.known_filters:
            if re.search(self.known_filters[i], self.query.split()[-1]):
                self.filters.add(i)
                self.query = re.sub(self.known_filters[i], "", self.query)
        if self.debug:
            print "WITH FILTERS:", self.filters
        if not len(self.filters):
            self.filters.add("complete")

    def clean_stem_query(self):
        q = ""
        for token in re.sub(r"[.,:;\-!?\"']", " ", self.query).split():
            try:
                lower = token.lower()
                if stringcheck.check(lower):
                    q += self.stemmer.stem_word(lower) + " "
            except:
                if self.debug:
                    print "Probable unicode error in stemming query"
        self.query = q
        if self.debug:
            print "STEMMED QUERY:", self.query

    def filter_query(self):
        '''
        Discovers document frequencies of query terms
        Returns a list of tuples of all terms that appear in the index
        Format = (term, df)
        '''
        return self.get_dfs(self.query.split())

    def exec_single_query(self, query):
        ''' optimized for a single query '''
        if self.debug:
            print "In exec single query"
        if "title_only" in self.filters:
            return [(i, 1) for i in self.db.smembers("T%s" % query.strip())]
        elif "pure_tfidf" in self.filters:
            return self.db.zrevrange(query.strip(), 0, self.limit - 1,
                                     withscores=True)
        else:
            q = query.strip()
            res = self.db.zrevrange(q, 0, self.limit - 1, withscores=True)
            dids = list([i[0] for i in res])
            title_rank = self.get_title_hit([q], dids)
            new_doc_ids = []
            for i, stuff in enumerate(res):
                new_doc_ids.append(
                    (stuff[0], self.weighted_ranking(tfidf=stuff[1],
                                                     title=title_rank[i])))
            if self.debug:
                print "RESULTS ", sorted(new_doc_ids,
                                         key=operator.itemgetter(1),
                                         reverse=True)
            return sorted(new_doc_ids, key=operator.itemgetter(1), reverse=True)

    def get_titles(self, term_list):
        docs = list(self.db.sinter(["T%s" % term for term in term_list]))
        if docs:
            for term in term_list:
                self.pipe.hmget("&T%s" % term, docs)
            ranked = []
            for i, v in enumerate(itertools.izip_longest(*self.flush())):
                score = 0
                for j in xrange(len(v) - 1):
                    score += 1.0 / (float(v[j + 1]) - float(v[j]))
                ranked.append((docs[i], score))
            return sorted(ranked, key=operator.itemgetter(1), reverse=True)
        return []

    def vector_retrieval(self, weighted_terms):
        '''
        A function to start vector space model retrieval
        Intersects all docIDs for every term in term_list
        Returns sorted tfidf-weighted docids
        '''
        if self.debug:
            print "performing vector retrieval on ", weighted_terms
        terms = [i[0] for i in weighted_terms]
        query_key = "".join(terms)
        self.pipe.zinterstore(query_key, dict(weighted_terms))
        self.pipe.zrevrange(query_key, 0, self.limit - 1, withscores=True)
        doc_ids = self.flush()[1]
        if not len(doc_ids):
            return None
        return self.rank_results(doc_ids, terms)

    def rank_results(self, doc_ids, terms):
        if "pure_tfidf" in self.filters:
            if self.debug:
                print "RESULTS ", doc_ids
            return doc_ids
        elif "complete" in self.filters:
            dids = list([i[0] for i in doc_ids])
            # rank by title
            title_rank = self.get_title_hit(terms, dids)
            # must do proximity ranking
            # get the posting lists
            sh = self.get_postings(terms, dids)
            posting_rank = []
            for v in itertools.izip_longest(*sh):  # decompose list of lists
                try:
                    posting_rank.append(
                        self.proximity_rank(
                            self.unfold_postings(
                                [[int(k) for k in j.split(",")] for j in v])))
                except:
                    pass
            new_doc_ids = []
            for i, stuff in enumerate(doc_ids):
                new_doc_ids.append(
                    (stuff[0], self.weighted_ranking(tfidf=stuff[1],
                                                     title=title_rank[i],
                                                     posting=posting_rank[i])))
            if self.debug:
                print "RESULTS ", sorted(new_doc_ids,
                                         key=operator.itemgetter(1),
                                         reverse=True)
            return sorted(new_doc_ids, key=operator.itemgetter(1), reverse=True)

    #########################################################################
    # RANKING FUNCTIONS
    #########################################################################

    def weighted_ranking(self, **kwargs):
        ''' kwargs carry the scores to be multiplied '''
        tfidf = kwargs.get('tfidf', 0)
        title = kwargs.get('title', 0)
        posting = kwargs.get('posting', 0)
        return tfidf * self.tfidf_w + title * self.title_w + posting * self.posting_w

    def proximity_rank(self, list_of_lists):
        '''
        A ranking function that calculates a score for words' proximity.
        This score is defined as the sum of 1/Prox for every continuous
        matches of them. Prox is a number indicating how close the words are.
        example: for words A and B, their postings are [1,4,10] and [2,6,17]
        then score = 1/(2 - 1 + 1) + 1/(6 - 4 + 1) + 1/(17 - 10 + 1)
        '''
        def sub(*args):
            return reduce(lambda x, y: y - x, args)

        _len = len(list_of_lists) - 1
        # add padding to shorter lists
        biggest = max([len(i) for i in list_of_lists])
        for i in list_of_lists:
            while len(i) != biggest:
                i.insert(0, i[0])
        score = 0
        while True:
            try:
                # get all heads
                _tuple = [i.pop(0) for i in list_of_lists]
                for i in xrange(1, len(_tuple)):
                    # ensure we keep order of postings
                    while _tuple[i] - _tuple[i - 1] < 0:
                        _tuple.pop(i)
                        _tuple.insert(i, list_of_lists[i].pop(0))
                score_vector = [i - _len for i in map(sub, _tuple)]
                # ensure no division with 0
                score += 1.0 / (score_vector[-1] - score_vector[0] - _len + 1)
            except:
                break
        return score

    #########################################################################
    # HELPER FUNCTIONS
    #########################################################################

    def unfold_postings(self, list_of_lists):
        ''' reverses gap encoding '''
        new_list_of_lists = []
        for _list in list_of_lists:
            nlist = []
            pos = 0
            for p in _list:
                pos += p
                nlist.append(pos)
            new_list_of_lists.append(nlist)
        return new_list_of_lists
wouldn't you you'd you'll you're you've your yours yourself
yourselves""".translate(table, punc).split())
for x in nltk.corpus.words.words():
    stopwords.add(x.translate(table, punc))
stopwords = set([stemm.stem_word(x) for x in stopwords])


# get a list of the normalized words from the tweet
def split_tweet(m):
    text = str(m['text']).translate(table, punc)  # repattern.sub('',
    return set([x for x in [stemm.stem_word(x.lower()) for x in text.split()
                            if x.lower().isalpha() and len(x) > 2
                            and not '#' in x and not 'http' in x]])


# get hashtags
def get_tags(m):
    return [stemm.stem_word(x['text'].lower()) for x in m['entities']['hashtags']]


english_vocab = set(stemm.stem_word(w.lower()) for w in nltk.corpus.words.words())
print 'english parsed'
# text_vocab = set(w.lower() for w in text if w.lower().isalpha())
# loop over all tweets found in files on @path
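# Illustrative call (not part of the original source), assuming stemm is an
# NLTK PorterStemmer with the old stem_word API and m follows the Twitter
# status JSON layout used above:
# get_tags({'entities': {'hashtags': [{'text': 'Running'}, {'text': 'NLP'}]}})
# # -> ['run', 'nlp']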
class rule_engine:

    def __init__(self):
        ## nltk stemmer
        self.stemmer = PorterStemmer()
        self.state_stack = []
        self.stack = []
        ## local variables
        self.Grounds = Groundings()
        self.Groundings = {}
        self.Types = {}

    def initialize(self, univ_sentence):
        self.tag_stack = univ_sentence.features
        self.hypergraph = univ_sentence.hypergraph

    def interpreter(self, FileList):
        vm = VM('r5rs')
        primitive_procedures = [
            ["match-rule?", self.matchRule],
            ["rule-applied", self.rule_applied],
            ["lemma", self.lemma],
            ["in-sentence?", self.inSentence],
            ["make-link", self.addLink],
            ["reset-scope", self.Grounds.variableScope],
            ["ground", self.Grounds.groundVariable],
            ["set-link-type", self.setType],
            ["make-phrase", self.makePhrase],
            ["output-phrase", self.Output],
            ["get-groundings", self.getGroundings],
            ["has-feature?", self.hasFeature],
            ["has-flag?", self.hasFlag],
        ]
        for name, procedure in primitive_procedures:
            vm.define(name, vm.toscheme(procedure))
        for File in FileList:
            vm.load(File)

    def run_rules(self):
        files = ['analysis/prep-rules.scm', 'analysis/triple-rules.scm']
        self.interpreter(files)

    def getGroundings(self):
        #debug(self.Groundings)
        pass

    def Output(self, Ground1, Ground2, Ground3):
        #self.Groundings = self.output
        ground_1 = None
        ground_2 = None
        ground_3 = None
        if self.Grounds.isGround(Ground1) and self.isVariable(Ground1):
            ground_1 = self.Grounds.getGrounding(Ground1)
        elif not self.isVariable(Ground1):
            ground_1 = Ground1
        if self.Grounds.isGround(Ground2) and self.isVariable(Ground2):
            ground_2 = self.Grounds.getGrounding(Ground2)
        elif not self.isVariable(Ground2):
            ground_2 = Ground2
        if self.Grounds.isGround(Ground3) and self.isVariable(Ground3):
            ground_3 = self.Grounds.getGrounding(Ground3)
        elif not self.isVariable(Ground3):
            ground_3 = Ground3
        if ground_1 and ground_2 and ground_3:
            debug([ground_1, ground_2, ground_3], prefix="triple")
            self.hypergraph.add_edge(ground_2, ground_3,
                                     edge_data=[ground_1],
                                     edge_type='triple',
                                     with_merge=False)

    def rule_applied(self, rule):
        #debug(rule, prefix="rule applied from scheme")
        #if hasattr(self, "output"):
        #    debug(self.output, prefix="Groundings")
        pass

    def hasFlag(self, flag, variable):
        ## do we have a given flag
        for x in self.hypergraph.edge_by_type('feature'):
            head, cur_tag, tail = x
            if cur_tag[0] == flag:
                if self.isVariable(variable) and self.Grounds.isGround(variable):
                    value = self.Grounds.getGrounding(variable)
                else:
                    value = variable
                if head == value:
                    return True
        return False

    def hasFeature(self, feature):
        for x in self.hypergraph.edge_by_type('feature'):
            head, cur_tag, tail = x
            if cur_tag == feature:
                return True
        return False

    def makePhrase(self, Ground1, Ground2):
        if self.Grounds.isGround(Ground1) and self.Grounds.isGround(Ground2):
            ground_1 = self.Grounds.getGrounding(Ground1)
            ground_2 = self.Grounds.getGrounding(Ground2)
            phrase = '_'.join([ground_1, ground_2])
            self.Grounds.groundVariable('$phrase', phrase)

    def inSentence(self, word):
        pass

    def isVariable(self, text):
        ## is it a variable?
        if text.startswith('$'):
            return True
        else:
            return False

    ## Type specifics
    def isType(self, variable):
        if self.Types.has_key(variable):
            return True
        else:
            return False

    def getType(self, variable):
        if self.isType(variable):
            return self.Types[variable]
        else:
            return False

    def setType(self, variable, Type):
        self.Types[variable] = Type

    ## Grounding specifics
    def compareGround(self, ground, idx):
        if self.Grounds.getGrounding(ground) != self.current[idx]:
            return False
        else:
            return True

    def addLink(self, ground_1, ground_2, type, data=None):
        if self.Grounds.isGround(ground_1) and self.Grounds.isGround(ground_2):
            ground_1 = self.Grounds.getGrounding(ground_1)
            ground_2 = self.Grounds.getGrounding(ground_2)
            self.hypergraph.add_edge(ground_1, ground_2, edge_data=[data],
                                     edge_type=type, with_merge=False)

    def matchRule(self, rule_set):
        results = []
        state = None
        stack = []
        for matches in self.match_rule_generator(rule_set):
            ## match to list then append the output
            ## find and match is a generator
            results.append(matches[0])
            state = matches[1]
            ## match it outright
            if rule_set == results:
                self.output = matches[1]
                return True
        return False

    def replaceOutput(self, ruleOutput, rule):
        if not ruleOutput:
            return False
        output = []
        for tag_set in rule:
            replaced = self.replaceVariable(tag_set, self.output)
            output.append(replaced)
        return output

    def replaceVariable(self, tag_frame, groundings):
        output = tag_frame
        for x in xrange(len(tag_frame)):
            if self.isVariable(tag_frame[x]):
                if tag_frame[x] in groundings.keys():
                    output[x] = groundings[tag_frame[x]]
        return output

    def matchTemplate(self, rule_set, callback):
        output_stack = []
        for key, value in rule_set.items():
            match_rule = callback(value)
            if match_rule[0]:
                output = self.replaceOutput(match_rule, value)
                output_stack.append((key, output))
            self.reset()
        return output_stack

    def matchRuleSet(self, ruleSet):
        output_stack = []
        for key, value in ruleSet.items():
            match_list = self.matchRule(value)
            if match_list:
                output = self.replaceOutput(match_list, value)
                output_stack.append((key, output))
            self.reset()
        return output_stack

    def lemma(self, word, grounding):
        if self.isVariable(word) and self.Grounds.isGround(word):
            groundedWord = self.Grounds.getGrounding(word)
            stemmed = self.stemmer.stem_word(groundedWord)
            self.Grounds.groundVariable(grounding, stemmed)
        elif not self.isVariable(word):
            stemmed = self.stemmer.stem_word(word)
            self.Grounds.groundVariable(grounding, stemmed)

    def compareRule(self, tag, var_1, var_2):
        ## check to see if we are ground and it is a var
        if self.isVariable(var_1) and self.Grounds.isGround(var_1):
            out = self.compareGround(var_1, 0)
            if not out:
                return False
        ## otherwise we need to ground it
        elif self.isVariable(var_1) and not self.Grounds.isGround(var_1):
            self.Grounds.groundVariable(var_1, self.current[0])
        if self.isVariable(var_2) and self.Grounds.isGround(var_2):
            out = self.compareGround(var_2, 2)
            if not out:
                return False
        elif self.isVariable(var_2) and not self.Grounds.isGround(var_2):
            self.Grounds.groundVariable(var_2, self.current[2])
        if tag == '$prep':
            ## XXX: bad way of doing this
            return True
        if self.isVariable(tag) and self.Grounds.isGround(tag):
            self.compareGround(tag, 1)
        elif self.isVariable(tag) and not self.Grounds.isGround(tag):
            self.Grounds.groundVariable(tag, self.current[1])
        return True

    def find_next(self, tag):
        if self.isType(tag):
            tagType = self.getType(tag)
            ## dont play with a loaded gun
            if self.hypergraph.has_edge_type(tagType):
                for x in self.hypergraph.edge_by_type(tagType):
                    head, cur_tag, tail = x
                    edge_data = cur_tag[0]
                    self.Grounds.groundVariable(tag, edge_data)
                    self.current = (head, edge_data, tail)
                    yield (head, edge_data, tail)
            else:
                yield False
            return
        for x in self.hypergraph.edge_by_type('feature'):
            if not isinstance(tag, list):
                if tag.startswith('$'):
                    head, tag, tail = x
                    self.current = (head, tag[0], tail)
                    yield self.current
                elif x[1][0] == tag:
                    head, tag, tail = x
                    self.current = (head, tag[0], tail)
                    yield self.current

    def match_rule_generator(self, rule_set):
        ## tag list is the list we match to
        ## it is the rule
        rule_set = deque(rule_set)
        ## has to be a deque with something in it
        if rule_set:
            popped = rule_set.popleft()
            self.stack.append(popped)
            ## pop off the tag list onto the stack
            tag, var_1, var_2 = self.stack[-1]
            ## find the next matching tag
            for x in self.find_next(tag):
                if x:
                    ## find our matching tag
                    match = self.compareRule(tag, var_1, var_2)
                    if match:
                        ## self.match_stack.append(self.groundings)
                        yield ([tag, var_1, var_2], self.Grounds.getGroundings())
                        ## run with recursion after yield
                        for x in self.match_rule_generator(rule_set):
                            ## check for sanity's sake
                            if len(rule_set) > 0:
                                tag, var_1, var_2 = rule_set.popleft()
                            else:
                                return
                            #debug((tag, var_1, var_2))
                            match = self.compareRule(tag, var_1, var_2)
                            #debug(match)
                            if match:
                                ## dump on the stack
                                self.stack.append((tag, var_1, var_2))
                                ## self.match_stack.append(self.groundings)
                                yield ([tag, var_1, var_2], self.Grounds.getGroundings())
                            else:
                                ## reset the groundings
                                self.Grounds.variableScope()
                else:
                    self.Grounds.variableScope()
def glossOverlap(gloss1, gloss2):
    # stopws = stopwords.words('english')
    stopws = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
              'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
              'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its',
              'itself', 'they', 'them', 'their', 'theirs', 'themselves',
              'what', 'which', 'who', 'whom', 'this', 'that', 'these',
              'those', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',
              'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
              'about', 'against', 'between', 'into', 'through', 'during',
              'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
              'in', 'out', 'on', 'off', 'over', 'under', 'then', 'there',
              'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
              'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
              'same', 'so', 'than', 's', 't', 'just', 'don']
    stemmer = PorterStemmer()

    def longestOverlap(a, b):
        now = [0] * len(b)
        bestOverlap = 0
        aStart = 0
        bStart = 0
        nextNonStopWord = [-1] * (len(a) + 1)
        for i in range(len(a) - 1, 0, -1):
            if a[i] not in stopws:
                nextNonStopWord[i] = i
            else:
                nextNonStopWord[i] = nextNonStopWord[i + 1]
        for i in range(1, len(a)):
            prev = now
            now = [0] * len(b)
            if a[i] == '#':
                continue
            for j in range(1, len(b)):
                if b[j] == '#':
                    continue
                if a[i] == b[j]:
                    now[j] = max(now[j], prev[j - 1] + 1)
                    if a[i] in stopws:
                        continue
                    overlap = now[j]
                    start = i - overlap + 1
                    start = nextNonStopWord[start]
                    overlap = i - start + 1
                    if bestOverlap < overlap:
                        bestOverlap = overlap
                        aStart = i - overlap + 1
                        bStart = j - overlap + 1
        return (bestOverlap, aStart, bStart)

    regex = ',|\.|\s|\?|\'|\"|!|;|-'
    # maybe check what happens if we don't stem the glosses
    a1 = ['#'] + [stemmer.stem_word(x.lower()) for x in re.split(regex, gloss1) if x]
    a2 = ['#'] + [stemmer.stem_word(x.lower()) for x in re.split(regex, gloss2) if x]
    score = 0
    (overlap, start1, start2) = longestOverlap(a1, a2)
    while overlap > 0:
        a1[start1:start1 + overlap] = ['#']
        a2[start2:start2 + overlap] = ['#']
        score += overlap ** 2
        (overlap, start1, start2) = longestOverlap(a1, a2)
    return score
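# Illustrative call (not part of the original source), assuming re and an
# old-API PorterStemmer are imported at module level: with the indentation
# reconstructed above, the longest common run "fruit with red" (length 3) and
# the single match "skin" should contribute 3**2 + 1**2 = 10 under the
# squared-overlap scoring.
# glossOverlap("a fruit with red or yellow skin", "fruit with red skin")  # -> 10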