Example no. 1
	def __init__(self, words, is_binary=False):
		self._keywords = words
		self._stemmed_keywords = []
		stemmer = PorterStemmer()
		for word in words:
			self._stemmed_keywords.append(stemmer.stem_word(word))
		self._is_binary = is_binary
Example no. 2
def get_stemmed_terms_list(doc, stem_words_map = None, stem_bigrams_map = None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []

    clean_doc = [(w.strip()).lower() for w in doc.split() if len(w) in range(3,16)]
    filtered_words = [w.strip('.,;?!:)(#') for w in clean_doc if not w.strip('.,;?!:)(#') in stopwords.words('english')]

    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem_word(w)
            local_map[w_temp] = w
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0)+1
            word_list.append(w_temp)

    bigrams = nltk.bigrams(word_list)
    for b in bigrams:
        bigram_org = (local_map[b[0]],local_map[b[1]])
        if stem_bigrams_map is not None:
            if b not in stem_bigrams_map:
                stem_bigrams_map[b] = dict()
            stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(bigram_org, 0)+1

    return word_list, bigrams
Example no. 3
	def extract(self, line):
		"""
		find word pairs that co-occur and extract # of minimum distance word pairs in the line
		"""
		words = self.tokenize(line.lower())
		count = 0.0
		stemmer = PorterStemmer()
		bad_indices = [] 
		you_indices = [] 
		for i in range(len(words)):
			word = words[i] 
			if word in self._youwords: 
				you_indices.append(i)
			word = stemmer.stem_word(word)
			if word  in self._stemmed_badwords or self.isWordPartOf(word,self._badwords): 
				bad_indices.append(i)
			
		 
		if not bad_indices or not you_indices: 
			return [-1]
		else: 
			#print line 
			#print bad_indices
			#print you_indices
			distances = [] 
			for bindex in bad_indices:
				for yindex in you_indices: 
					distances.append(abs(bindex - yindex))
			#print distances
			mn = min(distances)
			count = sum([1 for d  in distances if d == mn])
			#return [(count *1.0)* mn/len(line)]		
			return [1]
Example no. 4
def get_stemmed_terms_list(doc, stem_words_map=None, stem_bigrams_map=None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []

    clean_doc = [(w.strip()).lower() for w in doc.split()
                 if len(w) in range(3, 16)]
    filtered_words = [
        w.strip('.,;?!:)(#') for w in clean_doc
        if not w.strip('.,;?!:)(#') in stopwords.words('english')
    ]

    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem_word(w)
            local_map[w_temp] = w
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0) + 1
            word_list.append(w_temp)

    bigrams = nltk.bigrams(word_list)
    for b in bigrams:
        bigram_org = (local_map[b[0]], local_map[b[1]])
        if stem_bigrams_map is not None:
            if b not in stem_bigrams_map:
                stem_bigrams_map[b] = dict()
            stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(
                bigram_org, 0) + 1

    return word_list, bigrams
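
A minimal driver for the function above, shown only as a sketch: the sample document and map names are invented, and it assumes nltk with its stopwords corpus plus an older NLTK release whose PorterStemmer still exposes stem_word (newer releases expose stem instead).

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

doc = "Running runners keep running around the running track"
stem_map = dict()    # stemmed term -> {surface form: count}
bigram_map = dict()  # stemmed bigram -> {surface bigram: count}

stems, stem_bigrams = get_stemmed_terms_list(doc, stem_map, bigram_map)
# stems now holds stemmed tokens such as 'run' and 'runner';
# stem_map records which surface forms collapsed onto each stem.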
Example no. 5
	def __init__(self, badwords,youwords):
		self._badwords = badwords
		self._stemmed_badwords = []
		self._youwords = youwords
		self._part_of_badword = {}    #cache of words that start or end with offensive content
		stemmer = PorterStemmer()
		for word in badwords:
			self._stemmed_badwords.append(stemmer.stem_word(word))
Example no. 6
 def _getFeatures(self, corpus):
     stemmer = PorterStemmer()
     tokens = corpus.split(" ")
     features = filter(lambda x: len(x) > 1, tokens)
     
     finalList = [] 
     for feature in features :
         feature = re.sub("[^a-zA-Z0-9']", "", feature.lower())
         finalList.append(stemmer.stem_word(feature))
         
     return finalList
Example no. 7
	def __init__(self, wordlist1,wordlist2,mindist = 1,maxdist=100):
		self._wordlist = []
		stemmer = PorterStemmer()
		self._mindistance = mindist
		self._maxdistance = maxdist
		for word1 in wordlist1:
			for word2 in wordlist2: 
			   word1 = stemmer.stem_word(word1)
			   self._wordlist.append(word1 + word2)
			   self._wordlist.append(word1 + "-" + word2)
			   self._wordlist.append(word1)
Example no. 8
	def extract(self, line):
		
		words = self.tokenize(line.lower())
		count = 0.0
		stemmer = PorterStemmer()
		for word in words:
			word = stemmer.stem_word(word)
			if word in self._stemmed_keywords:
				count += 1
		if self._is_binary:
			return [1] if count > 0 else [0]
		else:
			return [count]
Example no. 9
	def extract(self, line):
		"""
		find word pairs that co-occur and extract # of minimum distance word pairs in the line
		"""
		words = self.tokenize(line.lower())
		 
		stemmer = PorterStemmer()
		 
		for i in range(len(words)):
			word = stemmer.stem_word(words[i])
			if word in self._wordlist: 
			   return [1.0]
		return [0.0]
Example no. 10
 def getFeatures(self, corpus):
     stemmer = PorterStemmer()
     stems = FreqDist()
     onlyLettersNumbers = re.compile('[^a-zA-Z0-9%!]')
     corpus = onlyLettersNumbers.sub(' ', corpus.lower())
     corpus = TreebankWordTokenizer().tokenize(corpus)
     
     count = 0
     for word in corpus :
         if not stopwords.STOP_WORDS.get(word) and len(word.strip()) > 1 :
             stems.inc(stemmer.stem_word(word))
             count += 1
             if self.__maxFeatures > 0 and count >= self.__maxFeatures :
                 break
             
     features = stems.samples()
     
     return features
Example no. 11
from nltk.stem.porter import PorterStemmer
import re
NORMALIZE_TERM_REG = "[^a-zA-Z0-9 ]"
porter = PorterStemmer()
lower_term = lambda term: term.lower()
terms_filter = lambda term: term is not None and len(term.strip()) != 0
normalize = lambda term : re.sub(NORMALIZE_TERM_REG, "", term)
remove_space = lambda term: term.strip()
stem_terms = lambda term: porter.stem_word(term)



def split_and_normalize_terms(terms, pattern):
	new_terms = filter(terms_filter, terms)
	new_terms = split_terms(new_terms, pattern)
	new_terms = map(normalize, new_terms)
	new_terms = filter(terms_filter, new_terms)
	new_terms = map(remove_space, new_terms)
	return new_terms

def filter_and_normalize_terms(terms):
	new_terms = filter(terms_filter, terms)	
	new_terms = map(normalize, new_terms)
	new_terms = filter(terms_filter, new_terms)
	return new_terms

def split_terms(terms, pattern):
	new_terms = []
	for term in terms:
		splited_terms = re.split(pattern, term)
		new_terms.extend(splited_terms)
	return new_terms
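
A small usage sketch for the helpers above, assuming the Python 2 semantics of the surrounding snippets (where filter and map return lists); the sample terms and split pattern are made up.

terms = split_and_normalize_terms(["Hello, world!", None, "  NLTK  "], r"[,\s]+")
# terms == ['Hello', 'world', 'NLTK']
stems = map(stem_terms, filter_and_normalize_terms(["running", "flies"]))
# stems == ['run', 'fli']   (Porter stems via the older stem_word API)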
Example no. 12
class Merger:
    SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
    EXA = Namespace('http://www.example.com/#')

    def __init__(self, files, verbose):
        """Create a new instance.
        init takes two parameters, **files** and **verbose**. **files** is a list
        of files that will become global to this class.
        **verbose** sets the level of logging. If set to none, the loglevel
        is ``warning`` and will effectively never be triggered. Debug will set the
        logger level to ``debug`` - this will get you a lot of information.
        File, screen and both will set the level to info and tell the logger
        where the output should go. The logfile is called ``parserLog.txt``.
        init also creates a resultgraph that contains all thesauri and their
        mappings.
        :param files: list of thesauri
        :type files: list
        :param verbose: A string of (screen, file, both, none, debug)
        :type verbose: string
        """
        if verbose not in ('screen', 'file', 'both', 'none', 'debug'):
            sys.exit('Value of verbose must be screen, file, both, debug'
                     ' or none')
        # Create a logging instance and enable different levels based on verbose
        self.logger = logging.getLogger()
        if verbose == 'none':
            self.logger.setLevel(logging.WARNING)
        elif verbose == 'debug':
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.INFO)
        self.formatter = logging.Formatter(
                         '%(asctime)s - %(levelname)s - %(message)s')
        self.log(output=verbose)
        self.logger.debug('Logger created')
        self.logger.debug('Initalizing global variables')
        #self.files = files  # List of input files
        self.porter = PorterStemmer()
        self.result = ConjunctiveGraph()  # Where it ends
        self.graphlist = self.parseGraphs(files)
        self.mergeFiles(self.graphlist)
        self.logger.debug('Got %s inputfiles' % (len(files)))
       # self.graph = {}  # contains the parsed input files
        self.logger.info('Merger initiated')
        self.addContext()
        self.reporting = {}
        self.reporting['equals'] = 0
        self.reporting['substrings'] = 0
        self.reporting['phrase'] = 0
        self.reporting['related'] = 0

    def log(self, output='both'):
        """this method creates a filehandle and a screenhandle for
        the logger. Depending on the output variable, it will call
        logToFile, to create the filehandle or logToScreen, to
        create the screenhandle, or both. If output is debug,
        log creates two handles as well. If output is none, nothing
        happens.

        Expected parameter: file, screen, both, debug, none"""
        def logToFile():
            """Create a filehandle for logging. The file is called
            parserLog.txt. Loglevel is set to debug, so both, info and
            debug will be written.
            TODO: User gets to decide filename and location
            """
            fh = logging.FileHandler('parserLog.txt')
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(self.formatter)
            self.logger.addHandler(fh)

        def logToScreen():
            """Create screenhandle for logging. All logs get written
            onto screen. Loglevel is set to debug, so both, info and
            debug will be written.
            TODO: redundant?
            """
            scr = logging.StreamHandler()  # Print on screen
            scr.setLevel(logging.DEBUG)
            scr.setFormatter(self.formatter)
            self.logger.addHandler(scr)
        if output in ('both', 'debug'):
            logToFile()
            logToScreen()
        if output == 'file':
            logToFile()
        if output == 'screen':
            logToScreen()

    def parseGraphs(self, files):
        """This Method takes the input file names and
        parses them.
        return: graphs - a list of ConjunctiveGraphs
        """
        self.logger.info('Parsing the input files: %s' % (files))
        graphs = []
        for i in range(len(files)):
            graphs.append(ConjunctiveGraph('IOMemory'))
        for i in range(len(files)):
            graphs[i].parse(files[i], format='xml')
        self.logger.debug('Graphlist created. Length is %s' % (len(graphs)))
        return graphs

    def mergeFiles(self, graphs):
        """Calls the addContent-method on each graph in self.graph dictionary.
        This will write all graphs into one graph (the resultgraph)
        """
        self.logger.info('Merging inputfiles into resultgraph...')
        for i in range(len(graphs)):
            self.addContent(graphs[i])

    def addContext(self):
        """
        Adding the namespacebinding for SKOS and a custom namespace EXA.
        TODO: Own Method for custom NS binding?
        """
        self.result.bind('skos', Merger.SKOS)
        self.result.bind('exa', Merger.EXA)
        self.addToResult(Merger.EXA.distantMatch, RDF.type, OWL.ObjectProperty)


    def getLabels(self, graph):
        """getLabels takes a ConjunctiveGraph instance and finds all
        SKOS:prefLabel and rdfs:label. It returns a dictionary of {uri: label}.
        param: graph - a ConjunctiveGraph instance
        return: compdict - a dictionary of all {uri: label} for a graph
        """
        compdict = {}
        self.logger.info('Getting labels from %s' % (graph))
        if (None, Merger.SKOS.prefLabel, None) in graph: 
            for uri, label in graph.subject_objects(Merger.SKOS.prefLabel):
                compdict[uri] = label.toPython().strip().lower()
        if (None, URIRef("http://www.w3.org/2000/01/rdf-schema#label"), None) in graph:
            for uri, label in graph.subject_objects(
                   URIRef("http://www.w3.org/2000/01/rdf-schema#label")):
                compdict[uri] = label.toPython().strip().lower() 
        return compdict

    def removeDiacritics(self, label):
        """
        This method uses unicodedata() to remove diacritics from a string
        TODO: Does this work without unicodedata?
        param: string
        return: string
        """
        label = ''.join((c for c in unicodedata.normalize('NFD', unicode(label)) if unicodedata.category(c) != 'Mn'))
        return label

    def removePunctuation(self, label):
        """This method removes punctuations. Right now, it will remove
        '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        param: string
        return: string
        """
        for punct in string.punctuation:
            label = label.replace(punct," ")
        return label

    def stemWords(self, label):
        """This method stems a single word or a phrase
        param: string
        return: string
        """ 
        
        label = " ".join(self.porter.stem_word(word) for word in label.split(" "))
        return label

    def __broaders(self, uri, graph):
        """This "private" method is a generator over broaderTerms of a given URI in
        a given ConjunctiveGraph. Use getParents() to obtain the list of broaderTerms
        """
        for n in graph.transitive_objects(URIRef(uri), URIRef('http://www.w3.org/2004/02/skos/core#broader')):
            if (uri==n):
                continue
            yield n

    def __narrowers(self, uri, graph):
        """This "private" method is a generator over narrowerTerms of a given URI in
        a given ConjunctiveGraph. Use getchildren() to obtain the list of narrowerTerms
        """
        for n in graph.transitive_objects(URIRef(uri), URIRef('http://www.w3.org/2004/02/skos/core#narrower')):
            if (uri==n):
                continue
            yield n

    def getParents(self, uri, graph):
        labels = []
        for n in self.__broaders(uri, graph):
            for label in graph.objects(n, URIRef("http://www.w3.org/2004/02/skos/core#prefLabel")):
                labels.append(label.toPython().strip().lower())
        return labels

    def getChildren(self, uri, graph):
        labels = []
        for n in self.__narrowers(uri, graph):
            for label in graph.objects(n, URIRef("http://www.w3.org/2004/02/skos/core#prefLabel")):
                labels.append(label.toPython().strip().lower())
        return labels

    def isSameTerm(self, label1, label2):
        """returns true if two items are equal
        params: string
        """
        if label1 ==label2:
            self.logger.debug('Identical terms found %s - %s' % (label1, label2))
            return True

    def isSubstring(self, label1, label2):
        """returns true if one item is a substring of the other.
        params: string
        """
        if len(label1.split(' ')) == 1 and len(label2.split(' ')) == 1:
            # if both labels consist of a single word
            # (neither split produced a phrase)
            if (label1 in label2 or label2 in label1):
                self.logger.debug('Substring found for %s and %s'
                                  % (label1, label2))
                return True

    def isPhrase(self, label1, label2):
        """returns true if one item is a phrase containing the other item.
        params: string
        """
        if len(label1.split(' ')) > 1 or len(label2.split(' ')) > 1:
            # if one of the lists holds more than one word (is a phrase)
            for word in label1.split(' '):
                for word2 in label2.split(' '):
                    if word == word2:
                        return True

    def termSignatur(self, label):
        """This method will return the term signatur of phrase.
        It stems and sort the phrase.
        param: string
        return: string
        """
        return ''.join(sorted(self.porter.stem_word(word) for word in label.split(" ")))

    def isSameSig(self, label1, label2):
        if len(label1.split(' ')) > 1 and len(label2.split(' ')) > 1:
            label1 = self.termSignatur(label1)
            label2 = self.termSignatur(label2)
            if label1 == label2:
                return True

    def isConceptScheme(self, uri):
        """returns true if a uri is a SKOS:ConceptScheme
        TODO: Method for custom "type"-checking
        """
        i = 0
        for triple in self.result.triples((uri, RDF.type,
                                           Merger.SKOS.ConceptScheme)):
            i += 1
        if i > 0:
            self.logger.debug('Is a ConceptScheme: %s' % (uri))
            return True
        return False

    def addEquals(self, uri1, uri2):
        """addEquals takes two URIs and adds them to the resultgraph.
        if one of the URIs is a SKOS:ConceptScheme, the predicate will
        be EXA:distantMatch instead of SKOS:CloseMatch.
        """
        self.reporting['equals'] += 1
        if self.isConceptScheme(uri1) or self.isConceptScheme(uri2):
            self.logger.debug('Adding equal Concept - ConceptScheme relation')
            self.addToResult(uri1, Merger.EXA.distantMatch, uri2)
        else:
            self.logger.debug('Adding equal terms')
            self.addToResult(uri1, Merger.SKOS.closeMatch, uri2)

    def addRelated(self, uri1, uri2):
        """addEquals takes two URIs and adds them to the resultgraph.
        if one of the URIs is a SKOS:ConceptScheme, the predicate will
        be EXA:distantMatch instead of SKOS:CloseMatch.
        """
        self.reporting['related'] += 1
        if self.isConceptScheme(uri1) or self.isConceptScheme(uri2):
            self.logger.debug('Adding related Concept - ConceptScheme relation')
            self.addToResult(uri1, Merger.EXA.distantMatch, uri2)
        else:
            self.logger.debug('Adding related terms')
            self.addToResult(uri1, Merger.SKOS.semanticRelation, uri2)

    def addSubstrings(self, uri1, uri2):
        """addSubstrings takes two URIs and adds them to the resultgraph.
        if one of the URIs is a SKOS:ConceptScheme, the predicate will
        be EXA:distantMatch instead of SKOS:relatedMatch.
        """
        self.reporting['substrings'] += 1
        if self.isConceptScheme(uri1) or self.isConceptScheme(uri2):
            self.logger.debug('Adding substr Concept - ConceptScheme relation')
            self.addToResult(uri1, Merger.EXA.distantMatch, uri2)
        else:
            self.logger.debug('Adding substrings')
            self.addToResult(uri1, Merger.SKOS.relatedMatch, uri2)

    def addPhrase(self, uri1, uri2):
        """addPhrase takes two URIs and adds them to the resultgraph.
        if one of the URIs is a SKOS:ConceptScheme, the predicate will
        be EXA:distantMatch instead of SKOS:relatedMatch.
        """
        self.reporting['phrase'] += 1
        if self.isConceptScheme(uri1) or self.isConceptScheme(uri2):
            self.addToResult(uri1, Merger.EXA.distantMatch, uri2)
            self.logger.debug('Adding phrase Concept - ConceptScheme relation')
        else:
            self.logger.debug('Adding phrase')
            self.addToResult(uri1, Merger.SKOS.relatedMatch, uri2)

    def addToResult(self, s, p, o):
        """Adds a triple to the resultgraph"""
        self.result.add((s, p, o))

    def writeToFile(self, dest, ext):
        """Serializes the resultgraph into a file
        TODO: custom filename and format
        """
        self.logger.info('Writing output...')
        self.result.serialize(destination=dest, format=ext)
        self.logger.info('Done.')

    def addContent(self, graph):
        """Adds all triples from a graph to the resultgraph"""
        for s, p, o in graph:
            self.addToResult(s, p, o)
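
A hedged sketch of how the Merger class above might be driven end to end; the thesaurus file names and the output file name are placeholders, and it assumes rdflib, the SKOS input files, and the older NLTK stem_word API are all available.

merger = Merger(['thesaurusA.rdf', 'thesaurusB.rdf'], verbose='screen')   # placeholder files
labels = merger.getLabels(merger.graphlist[0])   # {uri: lower-cased label}
merger.writeToFile('merged.rdf', 'xml')          # serialize the result graph
# merger.reporting holds the counters for 'equals', 'substrings',
# 'phrase' and 'related' matches added so far.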
Example no. 13
class DateExtractor(object):
    """Module for extracting date expressions from text.
    Examples of recognized dates:
        - 10 days
        - ten days
        - 11/10/2016
        - 12 months
        - one year.
    """

    GRAMMAR = r"""
        DATE: {<JJ|NN|NNP|CD|VB><\(><CD><\)><VBG|NN><NNS|NN|JJ>} # thirty (30) working|business days
              {<JJ|NN|NNP|CD|VB><\(><CD><\)><NNS|NN|JJ>} # thirty (30) days
              {<CD><NN><NNS|NN>} # thirty business days
              {<CD><NN|NNS|JJ>} # 10 days, 1 year
    """

    def __init__(self):
        super(DateExtractor, self).__init__()

        self.parser = nltk.RegexpParser(self.GRAMMAR)
        self.stemmer = PorterStemmer()

    def extract_dates(self, text):
        sentences = self._get_sentences(text)
        tagged_sentences = [nltk.pos_tag(sent) for sent in sentences]

        result = []
        for sentence in tagged_sentences:
            tree = self.parser.parse(sentence)

            for expression in self._extract_data_from_tree(tree):
                if not expression or self._is_false_positive(expression, sentence):
                    continue

                expression = self._extend_to_left(expression, sentence)
                result.append(expression)

        return result

    def _extend_to_left(self, expression, tagged_sentence):
        """Try to complete the first numeral. It's a healing method for the next scenario:
        seventy two (72) days.
        """
        num_text = self._extract_number_from_expression(expression)
        if not num_text:
            return expression

        if expression.startswith(num_text):
            # Expression is correctly formatted
            return expression

        # If the text had contained '-', the expression would already be correct.
        num_words = num_text.split('-')

        if not expression.lower().startswith(num_words[-1]):
            print 'Expression %s does not start with the expected word %s' % (expression,
                                                                              num_words[-1])
            return expression

        sentence = join_sentence([t[0] for t in tagged_sentence])

        # Last word is in the expression, the other words we expect to find in the subsentence.
        # The fact that there can be more such expressions in a sentence was considered but we
        # should not do anything special for them because agreements have a pseudo structure which
        # leads to consistency in terms of formats.
        idx = sentence.find(expression)

        subsentence = sentence[:idx].strip().split()
        subsentence.reverse()

        num_words = num_words[:-1]
        num_words.reverse()

        idx = 0
        wc = len(num_words)
        while idx < wc and num_words[idx] == subsentence[idx]:
            expression = '%s %s' % (num_words[idx], expression)
            idx += 1

        return expression

    def _extract_number_from_expression(self, expression):
        number_search = re.search('\(([0-9]+)\)', expression)
        if not number_search:
            return None

        try:
            number = int(number_search.groups()[0])
        except:
            print "Could not extract number from expression: %s" % expression
            return None

        return num2words(number)

    def _extract_data_from_tree(self, tree):
        expressions = []
        for subtree in tree.subtrees():
            if not subtree.label() == 'DATE':
                continue

            expressions.append(join_sentence([t[0] for t in subtree.leaves()]))

        return expressions

    def _get_sentences(self, text):
        sentences = nltk.sent_tokenize(text)
        # Remove new lines
        sentences = [s.replace('\r\n', ' ') for s in sentences]

        # Collapse whitespaces
        rex = re.compile(r'[ \t]+')
        sentences = [rex.sub(' ', s) for s in sentences]

        sentences = [nltk.word_tokenize(sent) for sent in sentences]
        return sentences

    def _is_false_positive(self, expression, tagged_sentence):
        # The last token should be either time unit or 'period'
        time_unit = expression.split()[-1]

        stem = self.stemmer.stem_word(time_unit)
        if stem not in ALLOWED_STEMS:
            return True

        if stem == YEAR_STEM:
            # Check if the sentence represents an age expression (it is followed by "of age"
            # or "old").
            sentence = join_sentence([t[0] for t in tagged_sentence])
            idx = sentence.find(expression) + len(expression)
            subsentence = sentence[idx:].strip()
            # Note: This check may fail if in the same sentence there are both age expressions and
            # date expressions in years. This should not be a problem since no reviewed document
            # has this case (it also doesn't make sense in the pseudo-structure of agreements).
            if any(subsentence.startswith(expr) for expr in AGE_EXPRESSIONS):
                return True

        return False
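
As a usage sketch only: the agreement text below is invented, and the call assumes nltk (with its tokenizers and POS tagger), num2words, and the module-level helpers referenced above (join_sentence, ALLOWED_STEMS, YEAR_STEM, AGE_EXPRESSIONS) are available.

extractor = DateExtractor()
text = ("The tenant shall vacate the premises within thirty (30) business days. "
        "Payment is due in 10 days.")
dates = extractor.extract_dates(text)
# Expected to yield expressions such as 'thirty (30) business days' and '10 days',
# depending on how nltk's POS tagger labels the tokens.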
Example no. 14
class BooleanSearch(object):
    """
    This class handles the parsing and execution of queries from a provided query file based on
    the index and postings file provided upon initialization. Results are saved into an output file.
    Parsing of queries is handled by converting the boolean expressions into an equivalent python
    expression that is executable.
    """
    eval_index_local = "index"
    eval_globals = {"__builtins__": None}
    replacements = [("AND NOT", "-"), ("AND", "&"), ("OR", "|"), ("NOT", "~"), ("(", " ( "), (")", " ) ")]
    exprs = set(["-", "&", "|", "~", "(", ")"])
    expr_postings_ref = "index[\"%s\"]"

    def __init__(self, index_filename, postings_filename):
        """
        index_filename refers to the dictionary file.
        postings_filename refers to the postings file.
        """
        self.stemmer = PorterStemmer()
        self.index = Index(index_filename, postings_filename)
        self.eval_locals = {self.eval_index_local: self.index}

    def _to_python_expression(self, query):
        """
        Parses a boolean expression by converting the boolean operator keywords into python's bitwise operators,
        and converts the terms into their respective index calls that return SkipList objects.
        The resulting expression is an executable python expression.

        WARNING: NOT SAFE FOR PRODUCTION SYSTEMS. FOR ACADEMIC PURPOSES ONLY.
        """
        query = reduce(lambda q,args: q.replace(*args), self.replacements, query)
        query_list = [x not in self.exprs and self.expr_postings_ref % self.stemmer.stem_word(x.lower()) or x for x in query.split()]
        return " ".join(query_list)

    def _execute_query(self, query):
        """
        Executes the provided query and returns the result

        WARNING: NOT SAFE FOR PRODUCTION SYSTEMS. FOR ACADEMIC PURPOSES ONLY.
        """
        expression = self._to_python_expression(query)
        try:
            result = eval(expression, self.eval_globals, self.eval_locals)
        except SyntaxError as se:
            return "Syntax Error occurred, possible malformed expression during conversion: %s" % expression
        except NameError as ne:
            return "Name Error occured, possible invalid object reference in query: %s" % expression
        else:
            return result

    def process_queries(self, query_filename, output_filename):
        """
        This method takes in a query filename and output filename.
        For every query, it writes the output into a new line.
        """
        try:
            with open(query_filename, 'r') as query_file, open(output_filename, 'w') as output_file:
                for row in query_file:
                    result = self._execute_query(row)
                    output_file.write(str(result) + "\n")
        except IOError as error:
            print "IO Error occured while attempting to run BooleanSearch"
            sys.exit(error.args[1])
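
To make the conversion described in the class docstring concrete, here is a sketch of what _to_python_expression might produce for a sample query; the file names are placeholders, the stems come from the older stem_word API, and the Index/SkipList types are assumed from the surrounding module.

searcher = BooleanSearch('dictionary.txt', 'postings.txt')   # placeholder file names
expr = searcher._to_python_expression('bills AND NOT (gates OR jobs)')
# expr is roughly: index["bill"] - ( index["gate"] | index["job"] )
# eval() then combines the SkipList postings using the overloaded -, & and | operators.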
Example no. 15
        if(word!=False):
            fullSearchURL=baseSearchURL+word
            f = urllib.urlopen(fullSearchURL)
            found=False
            for line in f.read().split("\n"):
                if("video src" in line):
                    endURL=line[12:]
                    endURL=endURL[:endURL.index("\"")]
                    fullURL=baseURL+endURL
                    print 'getting video from'+fullURL
                    found=True
                    break
            if(found):
                urllib.urlretrieve(fullURL,word+'.mp4')
        else:
            p = PorterStemmer()
            word=p.stem_word(origword)
            fullSearchURL=baseSearchURL+word
            f = urllib.urlopen(fullSearchURL)
            found=False
            for line in f.read().split("\n"):
                if("video src" in line):
                    endURL=line[12:]
                    endURL=endURL[:endURL.index("\"")]
                    fullURL=baseURL+endURL
                    print 'getting video from'+fullURL
                    found=True
                    break
            if(found):
                urllib.urlretrieve(fullURL,word+'.mp4')
Example no. 16
def get_word_counts( solr, fq, query, num_words, field='sentence' ) :
    print query

    print str(time.asctime())

    start_time = time.time()

    function_start_time = start_time
    
    results = fetch_all( solr, fq, query, 'sentence' )
    print "got " + query
    print len( results )
    print time.asctime()

    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )

    start_time = end_time

    print 'converting to utf8 and lowercasing'
    sentences = [ result['sentence'].lower() for result in results ]

    results = None

    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )

    start_time = end_time


    print 'calculating non_stemmed_wordcounts'
    term_counts = non_stemmed_word_count( sentences )

    if '' in term_counts:
        del term_counts['']

    print "Returned from non_stemmed_word_count"
    print time.asctime()
    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )

    start_time = end_time
    print "freeing sentences "
    sentences = None
    
    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )


    start_time = end_time

    print 'stemming and counting'

    stem_counts = collections.Counter()

    st = PorterStemmer()
    for term in term_counts.keys():
        #ipdb.set_trace()
        stem = st.stem_word( term )
        stem_counts[ stem ] += term_counts[ term ]


    end_time = time.time()
    print "done stemming and counting "
    print "time {}".format( str(end_time - start_time) )

    start_time = end_time

    print ' calculating stem to term map '
    stem_to_terms = {}
    for term in term_counts.keys():
        stem = st.stem_word( term )
        if stem not in stem_to_terms:
            stem_to_terms[ stem ] = []

        stem_to_terms[stem].append( term )

    print "done calcuating stem to term map "
    print "time {}".format( str(end_time - start_time) )


    counts = stem_counts.most_common( num_words )

    ret = [ ]
    for stem, count in counts:
        if len( stem_to_terms[ stem ] ) < 2:
            term = stem_to_terms[ stem][0]
        else:
            best_count = 0
            for possible_best in stem_to_terms[ stem ] :
                if term_counts[ possible_best ] > best_count:
                    term = possible_best
                    best_count = term_counts[ possible_best ]

        ret.append( 
            { 'stem': stem, 
              'term': term,
              'count': count
              } )


    end_time  = time.time()
    print "total time {}".format( str(end_time - function_start_time) )

    return ret
Example no. 17
  db = c['haiku']

  # Open connection to Twitter's public timeline, build haikus
  failed = True
  while failed:
    failed = False
    try:
      with tweetstream.SampleStream(USER,PASS) as stream:
        for tweet in stream:
          if 'text' in tweet and len(tweet['text'])>0:
            screen_name = tweet['user']['screen_name']
            hashes = [j for j in set([i for i in tweet['text'].split() if i.startswith('#')])]
            # Strip out urls, punctuation, RTs, and @'s
            tweet_stripped = urlre.sub('',tweet['text'])
            tweet_stripped = punctre.sub('',tweet_stripped)
            tweet_stemmed = [porter_stemmer.stem_word(i.lower()) for i in tweet_stripped.split()]
            # Keep unstemmed, stripped tweet for either storage or retweeting
            tweet_outgoing = [i.lower() for i in tweet_stripped.split()]
            # hack to make sure that only coherent tweets are passed through
            temp_tweet = [i.lower() for i in tweet_stemmed if not i.lower().startswith('rt')]
            tweet_for_topic = [i.lower() for i in tweet_stemmed if not i.lower().startswith('rt') and nsyl(i)>0 and i.lower() not in stopwords]
            if tweet_for_topic==temp_tweet and len(tweet_for_topic)>0:
              print 'Iteration '+str(counter)
              #Assign this tweet a topic
              docset = []
              docset.append(' '.join(i for i in tweet_for_topic))
              print 'Tweet: '+docset[0]
              (gamma, bound) = olda.update_lambda(docset)
              counter+=1
              if (counter % 100 == 0):
                numpy.savetxt('lambdas/lambda-%d.dat' % counter, olda._lambda)
Example no. 18
class QueryHandler(index_handler.IndexHandler):
    '''
    A class that handles queries upon our INDEX.
    
    It provides functions to deal with boolean retrieval or vector space model retrieval.
        
    '''
    
    def __init__(self, **kwargs):
        index_handler.IndexHandler.__init__(self,  **kwargs)    
        self.limit = kwargs.get('limit',10) 
        self.query = "" 
        self.filters = set()
        self.known_filters = FILTERS
        self.debug = kwargs.get('debug',True)         
        self.stemmer = PorterStemmer()
        self.res_cache_db = kwargs.get('res_cache_db',None)  
        self.res_cache_exp = kwargs.get('res_cache_exp',100)
        self.serializer = serializer
        self.tfidf_w = kwargs.get('tfidf_w',0.33)
        self.title_w = kwargs.get('title_w',0.33)
        self.posting_w = kwargs.get('posting_w',0.33)


    def clear(self):
        self.filters = set()
        self.query = ""         
 
        
    def process_query(self, query):
        ''' entry point for query processing '''
        
        self.clear()
        initial_query = query
        self.query = query
        if self.debug: print "INITIAL QUERY:", initial_query
        
        self.apply_filters()
        self.clean_stem_query()
                   
        if len(self.query.split()) == 1:             
            res = self.exec_single_query(self.query)    
        elif "title_only" in self.filters:             
            res = self.get_titles(self.query.split())                
        else:                                           
            weighted_terms = self.filter_query()   
            res = self.vector_retrieval(weighted_terms)
            
        if res:
            external_ids = self.resolve_external_ids([i[0] for i in res])
            res = [(external_ids[i], res[i][1]) for i in xrange(len(res))]
    
            if self.res_cache_db:
                try:
                    self.res_cache_db.set(initial_query, self.serializer.dumps(res))
                except:
                    raise Exception, "CACHING SEARCH RESULT FAILED, UNREACHABLE DB"    
    
        return res
        



    def apply_filters(self):
        for i in self.known_filters:   
            if re.search(self.known_filters[i], self.query.split()[-1]):
                self.filters.add(i)
                self.query = re.sub(self.known_filters[i],"",self.query)   
                 
        if self.debug: print "WITH FILTERS:", self.filters        
        if not len(self.filters): self.filters.add("complete")
        
        

    def clean_stem_query(self):
        q = ""
        for token in re.sub(r"[.,:;\-!?\"']", " ", self.query).split():
            try: 
                lower = token.lower()
                if stringcheck.check(lower):
                    q += self.stemmer.stem_word(lower) + " "         
            except: 
                if self.debug: print "Probable unicode error in stemming query"  
                
        self.query = q    
        if self.debug: print "STEMMED QUERY:", self.query



    def filter_query(self):
        ''' 
        Discovers document frequencies of query terms
        Returns a list of tuples of all terms that appear in the index
        Format = (term,df)
        '''
        return self.get_dfs(self.query.split())


    def exec_single_query(self, query):
        ''' optimized for a single query '''
        if self.debug: print "In exec single query"
        if "title_only" in self.filters:
            return [(i,1) for i in self.db.smembers("T%s"%query.strip())]
        elif "pure_tfidf" in self.filters:    
            return self.db.zrevrange(query.strip(), 0, self.limit - 1 , withscores=True)
        else:
            q = query.strip()
            res = self.db.zrevrange(q, 0, self.limit - 1 , withscores=True)
            dids = list([i[0] for i in res])
            title_rank = self.get_title_hit([q], dids)
            new_doc_ids = []
            for i, stuff in enumerate(res):
                new_doc_ids.append( (stuff[0], self.weighted_ranking(tfidf=stuff[1], title=title_rank[i])) )    
                
            if self.debug: print "RESULTS " ,   sorted(new_doc_ids, key=operator.itemgetter(1), reverse=True)
            
            return sorted(new_doc_ids, key=operator.itemgetter(1), reverse=True)
        

    def get_titles(self, term_list):
        docs = list(self.db.sinter(["T%s"%term for term in term_list]))
        if docs:
            for term in term_list:  
                self.pipe.hmget("&T%s"%term, docs)
                
                
            ranked = []    
            for i, v in enumerate(itertools.izip_longest(*self.flush())): 
                score = 0
                for j in xrange(len(v) - 1):
                    score += 1.0/(float(v[j+1]) - float(v[j]))
                ranked.append((docs[i], score))
                
            return sorted(ranked, key=operator.itemgetter(1), reverse=True)
        
        return []


    def vector_retrieval(self, weighted_terms):
        ''' 
        A function to start vector space model retrieval
        Intersects all docIDs for every term in term_list
        Returns sorted tfidf-weighted docids
        '''
        if self.debug: print "performing vector retrieval on " , weighted_terms
        terms = [i[0] for i in weighted_terms]
        query_key = "".join(terms)

        self.pipe.zinterstore(query_key, dict(weighted_terms))
        self.pipe.zrevrange(query_key, 0, self.limit - 1 , withscores=True)
        
        doc_ids = self.flush()[1]
        if not len(doc_ids):
            return None
        
        return self.rank_results(doc_ids, terms) 


            
        

    def rank_results(self, doc_ids, terms):

        if "pure_tfidf" in self.filters:
            if self.debug: print "RESULTS ", doc_ids
            return doc_ids

        elif "complete" in self.filters:
            
            dids = list([i[0] for i in doc_ids])
            
            # rank by title
            title_rank = self.get_title_hit(terms, dids)
            
            # must do proximity ranking
            # get the posting lists
            sh = self.get_postings(terms, dids) # actually, I wanted to name this "shit"

            posting_rank = []

            for v in itertools.izip_longest(*sh):      # decompose list of lists  

                try: posting_rank.append( ( self.proximity_rank( self.unfold_postings([ [int(k) for k in j.split(",")] for j in v]) ) ) )
                except: pass
                
            new_doc_ids = []
            
            for i, stuff in enumerate(doc_ids):

                new_doc_ids.append( (stuff[0], self.weighted_ranking(tfidf=stuff[1], title=title_rank[i], posting=posting_rank[i] )) )    
                
            if self.debug: print "RESULTS " ,   sorted(new_doc_ids, key=operator.itemgetter(1), reverse=True)
            
            return sorted(new_doc_ids, key=operator.itemgetter(1), reverse=True)
        

    
    


  
            
        

#############################################################################################################
# RANKING FUNCTIONS
#############################################################################################################  


    def weighted_ranking(self, **kwargs):
        '''
        kwargs carry the scores to be multiplied
        '''
        tfidf = kwargs.get('tfidf', 0)
        title = kwargs.get('title', 0)
        posting = kwargs.get('posting', 0)
        
        return tfidf*self.tfidf_w + title*self.title_w + posting*self.posting_w
        


    def proximity_rank(self, list_of_lists):  
        '''
        A ranking function that calculates a score for word proximity.
        The score is defined as the sum of 1/Prox over successive matches,
        where Prox is a number indicating how close the words are.
        
        example: for words A and B, their postings are [1,4,10] and [2,6,17]
        
                 then score = 1/(2 - 1 + 1) + 1/(6 - 4 + 1) + 1/(17 - 10 + 1)
        '''        
        def sub(*args):
            return reduce(lambda x, y: y-x, args )
        
        _len = len(list_of_lists) - 1
        
        # add padding to shorter lists
        biggest = max([len(i) for i in list_of_lists])
        
        for i in list_of_lists:
            while len(i) != biggest:
                i.insert(0,i[0])
                
        
        score = 0
       
        while True: 
            
            try:
                # get all heads
                _tuple = [i.pop(0) for i in list_of_lists]
            
                for i in xrange(1,len(_tuple)):
                    # ensure we keep order of postings
                    while _tuple[i] - _tuple[i-1] < 0:
                        _tuple.pop(i)
                        _tuple.insert(i, list_of_lists[i].pop(0))
                
                score_vector =  [i - _len for i in map(sub, _tuple)]       
                #print _tuple  , score_vector[-1] - score_vector[0] - _len + 1
                score += 1.0/(score_vector[-1] - score_vector[0] - _len + 1) # ensure no division with 0

            except: break
        
        return score



                

#############################################################################################################
# HELPER FUNCTIONS
#############################################################################################################  

 
    def unfold_postings(self, list_of_lists):
        ''' reverses gap encoding '''
        new_list_of_lists = []
        
        for _list in list_of_lists:
            nlist = []
            pos = 0

            for p in _list:
                pos += p
                nlist.append(pos)
                
            new_list_of_lists.append(nlist)

        return new_list_of_lists      
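
To make the scoring formula in proximity_rank's docstring concrete, its own example works out as follows (this only unpacks the formula stated in the docstring, not output captured from the code): for postings [1, 4, 10] and [2, 6, 17] the score is 1/(2 - 1 + 1) + 1/(6 - 4 + 1) + 1/(17 - 10 + 1) = 0.5 + 0.333... + 0.125, roughly 0.96.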
Example no. 19
try:
    with tweetstream.SampleStream(USER, PASS) as stream:
        for tweet in stream:
            if 'text' in tweet and len(tweet['text']) > 0:
                screen_name = tweet['user']['screen_name']
                hashes = [
                    j for j in set([
                        i for i in tweet['text'].split()
                        if i.startswith('#')
                    ])
                ]
                # Strip out urls, punctuation, RTs, and @'s
                tweet_stripped = urlre.sub('', tweet['text'])
                tweet_stripped = punctre.sub('', tweet_stripped)
                tweet_stemmed = [
                    porter_stemmer.stem_word(i.lower())
                    for i in tweet_stripped.split()
                ]
                # Keep unstemmed, stripped tweet for either storage or retweeting
                tweet_outgoing = [
                    i.lower() for i in tweet_stripped.split()
                ]
                # hack to make sure that only coherent tweets are passed through
                temp_tweet = [
                    i.lower() for i in tweet_stemmed
                    if not i.lower().startswith('rt')
                ]
                tweet_for_topic = [
                    i.lower() for i in tweet_stemmed
                    if not i.lower().startswith('rt') and nsyl(i) > 0
                    and i.lower() not in stopwords
Example no. 20
wouldn't
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves""".translate(table,punc).split())


for x in nltk.corpus.words.words():
    stopwords.add(x.translate(table,punc))

stopwords = set([stemm.stem_word(x) for x in stopwords])

#get a list of the normalized words from the tweet
def split_tweet(m):
    text = str(m['text']).translate(table, punc)    
    #repattern.sub('', 
    return set([x for x in [stemm.stem_word(x.lower()) for x in text.split() if x.lower().isalpha() and len(x) > 2 and  not '#' in x and  not 'http' in x] ])

#get hashtags
def get_tags(m):
    return [stemm.stem_word(x['text'].lower()) for x in m['entities']['hashtags']]

english_vocab = set(stemm.stem_word(w.lower()) for w in nltk.corpus.words.words()) 
print 'english parsed'
#text_vocab = set(w.lower() for w in text if w.lower().isalpha()) 
#loop over all tweets found in files on @path
Example no. 21
class rule_engine:
    def __init__(self):
        ## nltk stemmer
        self.stemmer  = PorterStemmer()

        self.state_stack = []
        self.stack = []

        ## local variables
        self.Grounds = Groundings()
        self.Groundings = {}
        self.Types = {}
        
    def initialize(self, univ_sentence):
        self.tag_stack  = univ_sentence.features
        self.hypergraph = univ_sentence.hypergraph
        
    def interpreter(self, FileList):
        vm = VM('r5rs')
        
        primitive_procedures = [
            ["match-rule?", self.matchRule],
            ["rule-applied", self.rule_applied],
            ["lemma", self.lemma],
            ["in-sentence?", self.inSentence],
            ["make-link", self.addLink],
            ["reset-scope", self.Grounds.variableScope],
            ["ground", self.Grounds.groundVariable],
            ["set-link-type", self.setType],
            ["make-phrase", self.makePhrase],
            ["output-phrase", self.Output],
            ["get-groundings", self.getGroundings],
            ["has-feature?", self.hasFeature],
            ["has-flag?", self.hasFlag],
        ]
        for name, procedure in primitive_procedures:
            vm.define(name, vm.toscheme(procedure))

        for File in FileList:
            vm.load(File)
        
    def run_rules(self):
        files = ['analysis/prep-rules.scm', 'analysis/triple-rules.scm']
        self.interpreter(files)

    def getGroundings(self):
        #debug(self.Groundings)
        pass
        
    def Output(self, Ground1, Ground2, Ground3):
        #self.Groundings = self.output
        ground_1 = None
        ground_2 = None
        ground_3 = None
        
        if self.Grounds.isGround(Ground1) and self.isVariable(Ground1):
            ground_1 = self.Grounds.getGrounding(Ground1)
        elif not self.isVariable(Ground1):
            ground_1 = Ground1
            
        if self.Grounds.isGround(Ground2) and self.isVariable(Ground2):
            ground_2 = self.Grounds.getGrounding(Ground2)
        elif not self.isVariable(Ground2):
            ground_2 = Ground2
            
        if self.Grounds.isGround(Ground3) and self.isVariable(Ground3):
            ground_3 = self.Grounds.getGrounding(Ground3)
        elif not self.isVariable(Ground3):
            ground_3 = Ground3

        
        if ground_1 and ground_2 and ground_3:
            debug([ground_1, ground_2, ground_3], prefix="triple")
            
            self.hypergraph.add_edge(ground_2, ground_3,\
            edge_data=[ground_1], edge_type='triple', with_merge=False)
            
    def rule_applied(self, rule):
        #debug(rule, prefix="rule applied from scheme")
        #if hasattr(self, "output"):
        #    debug(self.output, prefix="Groundings")
        pass
    
    def hasFlag(self, flag, variable):
        ## do we have a given flag
        for x in self.hypergraph.edge_by_type('feature'):            
            head, cur_tag, tail = x
            if cur_tag[0] == flag:
                if self.isVariable(variable) and self.Grounds.isGround(variable):
                    value = self.Grounds.getGrounding(variable)
                else:
                    value = variable
                    
                if head == value:
                    return True

        return False
    
    def hasFeature(self, feature):
        for x in self.hypergraph.edge_by_type('feature'):
            head, cur_tag, tail = x
            if cur_tag == feature:
                return True

        return False
    
    def makePhrase(self, Ground1, Ground2):
        if self.Grounds.isGround(Ground1) and self.Grounds.isGround(Ground2):
            ground_1 = self.Grounds.getGrounding(Ground1)
            ground_2 = self.Grounds.getGrounding(Ground2)
            phrase = '_'.join([ground_1, ground_2])
            self.Grounds.groundVariable('$phrase', phrase)

    def inSentence(self, word):
        pass
    
    def isVariable(self, text):
        ## is it a variable?
        if text.startswith('$'):
            return True
        else:
            return False

    ## Type specifics
    def isType(self, variable):
        if self.Types.has_key(variable):
            return True
        else:
            return False
    
    def getType(self, variable):
        if self.isType(variable):
            return self.Types[variable]
        else:
            return False
    
    def setType(self, variable, Type):
        self.Types[variable] = Type

    ## Grounding specifics
    def compareGround(self, ground, idx):
        if self.Grounds.getGrounding(ground) != self.current[idx]:
            return False
        else:
            return True

    def addLink(self, ground_1, ground_2, type, data=None):
        if self.Grounds.isGround(ground_1) and self.Grounds.isGround(ground_2):
            ground_1 = self.Grounds.getGrounding(ground_1)
            ground_2 = self.Grounds.getGrounding(ground_2)
            
            self.hypergraph.add_edge(ground_1, ground_2, edge_data=[data], edge_type=type, with_merge=False)


    def matchRule(self, rule_set):
        results = []
        state   = None
        stack   = []
        
        for matches in self.match_rule_generator(rule_set):
            ## match to list then append the output
            ## find and match is a generator
            results.append(matches[0])
            state = matches[1]
            
        ## match it outright
        if rule_set == results:
            self.output = matches[1]
            return True

        return False
    
    def replaceOutput(self, ruleOutput, rule):
        if not ruleOutput:
            return False
        
        output = []
        
        for tag_set in rule:            
            replaced = self.replaceVariable(tag_set, self.output)
            output.append(replaced)

        return output
            
    def replaceVariable(self, tag_frame, groundings):
        output = tag_frame
        for x in xrange(len(tag_frame)):
            if self.isVariable(tag_frame[x]):
                if tag_frame[x] in groundings.keys():
                    output[x] = groundings[tag_frame[x]]

        return output
    
    def matchTemplate(self, rule_set, callback):
        output_stack = []
        for key, value in rule_set.items():
            match_rule = callback(value)
            if match_rule[0]:
                output = self.replaceOutput(match_rule, value)
                output_stack.append((key, output))
                
            self.reset()
                
        return output_stack
        
    def matchRuleSet(self, ruleSet):
        output_stack = []
        for key, value in ruleSet.items():
            match_list = self.matchRule(value)
            if match_list:
                output = self.replaceOutput(match_list, value)
                output_stack.append((key, output))
                
            self.reset()
            
        return output_stack
    
    def lemma(self, word, grounding):
        if self.isVariable(word) and self.Grounds.isGround(word):
            groundedWord = self.Grounds.getGrounding(word)
            stemmed = self.stemmer.stem_word(groundedWord)
            self.Grounds.groundVariable(grounding, stemmed)
            
        elif not self.isVariable(word):
            stemmed = self.stemmer.stem_word(word)
            self.Grounds.groundVariable(grounding, stemmed)

    def compareRule(self, tag, var_1, var_2):
        ## check to see if we are ground and it is a var
        if self.isVariable(var_1) and self.Grounds.isGround(var_1):
            out = self.compareGround(var_1, 0)
            if not out:
                return False
            
        ## otherwise we need to ground it
        elif self.isVariable(var_1) and not self.Grounds.isGround(var_1):
            self.Grounds.groundVariable(var_1, self.current[0])
     
        if self.isVariable(var_2) and self.Grounds.isGround(var_2):
            out = self.compareGround(var_2, 2)
            if not out:
                return False
            
        elif self.isVariable(var_2) and not self.Grounds.isGround(var_2):
            self.Grounds.groundVariable(var_2, self.current[2])
            
        if tag == '$prep':
            ## XXX: bad way of doing this
            return True
        
        if self.isVariable(tag) and self.Grounds.isGround(tag):
            self.compareGround(tag, 1)
                
        elif self.isVariable(tag) and not self.Grounds.isGround(tag):
            self.Grounds.groundVariable(tag, self.current[1])
        
        return True
    
    def find_next(self, tag):
        if self.isType(tag):
            tagType = self.getType(tag)
            ## dont play with a loaded gun
            if self.hypergraph.has_edge_type(tagType):
                for x in self.hypergraph.edge_by_type(tagType):
                    head, cur_tag, tail = x
                    edge_data = cur_tag[0]
                    self.Grounds.groundVariable(tag, edge_data)
                    self.current = (head, edge_data, tail)
                    yield (head, edge_data, tail)
            else:
                yield False            
                return
                        
        
        for x in self.hypergraph.edge_by_type('feature'):
            if not isinstance(tag, list):        
                if tag.startswith('$'):
                    head, tag, tail = x
                    self.current = (head, tag[0], tail)
                    yield self.current
                    
                elif x[1][0] == tag:
                    head, tag, tail = x
                    self.current = (head, tag[0], tail)
                    yield self.current
                    
    def match_rule_generator(self, rule_set):
        ## tag list is the list we match to
        ## it is the rule
        rule_set = deque(rule_set)
        
        ## has to be a deque with something in it
        if rule_set:
            popped = rule_set.popleft()
            self.stack.append(popped)
            ## pop off the tag list onto the stack
            tag, var_1, var_2 = self.stack[-1]

            ## find the next matching tag
            for x in self.find_next(tag):
                if x:
                    ## find our matching tag
                    match = self.compareRule(tag, var_1, var_2)
                    if match:
                        ## self.match_stack.append(self.groundings)
                        yield ([tag, var_1, var_2], self.Grounds.getGroundings())

                        ## run with recursion after yield
                        for x in self.match_rule_generator(rule_set):
                            ## check for sanity's sake
                            if len(rule_set) > 0:
                                tag, var_1, var_2 = rule_set.popleft()
                            else:
                                return
                            
                            #debug((tag, var_1, var_2))
                            match = self.compareRule(tag, var_1, var_2)
                            #debug(match)
                            if match:
                            ## dump on the stack
                                self.stack.append((tag, var_1, var_2))
                                ## self.match_stack.append(self.groundings)
                                yield ([tag, var_1, var_2], self.Grounds.getGroundings())

                            else:
                                ## reset the groundings
                                self.Grounds.variableScope()
                    else:
                        self.Grounds.variableScope()
Example no. 22
def glossOverlap(gloss1, gloss2):
	# stopws = stopwords.words('english')
	stopws = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 
	'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 
	'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 
	'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 
	'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 
	'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 
	'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 
	'in', 'out', 'on', 'off', 'over', 'under', 'then',  
	'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
	'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 
	'than', 's', 't', 'just', 'don']

	stemmer = PorterStemmer()

	# print "-"*50
	# for x in stopws:
	# 	if stemmer.stem_word(x) != x:
	# 		print x, stemmer.stem_word(x)

	def longestOverlap(a, b):
		now = [0]*len(b)
		bestOverlap = 0
		aStart = 0
		bStart = 0

		nextNonStopWord = [-1]*(len(a)+1)
		for i in range(len(a)-1, 0, -1):
			if a[i] not in stopws:
				nextNonStopWord[i] = i
			else:
				nextNonStopWord[i] = nextNonStopWord[i+1]

		for i in range(1, len(a)):
			prev = now
			now = [0]*len(b)
			if a[i] == '#':
				continue
			for j in range(1, len(b)):
				if b[j] == '#':
					continue
				if a[i] == b[j]:
					now[j] = max(now[j], prev[j-1] + 1)
					if a[i] in stopws:
						continue

					overlap = now[j]
					start = i - overlap + 1
					start = nextNonStopWord[start]
					overlap = i - start + 1
					if bestOverlap < overlap:
						bestOverlap = overlap
						aStart = i - overlap + 1
						bStart = j - overlap + 1

		return (bestOverlap, aStart, bStart)


	regex = ',|\.|\s|\?|\'|\"|!|;|-'
	#maybe check what happens if we don't stem the glosses
	a1 = ['#'] + [stemmer.stem_word(x.lower()) for x in re.split(regex, gloss1) if x]
	a2 = ['#'] + [stemmer.stem_word(x.lower()) for x in re.split(regex, gloss2) if x]

	score = 0
	(overlap, start1, start2) = longestOverlap(a1, a2)
	while overlap > 0:
		# print overlap
		# print a1[start1:start1+overlap]
		# print a2[start2:start2+overlap]
		a1[start1:start1+overlap] = ['#']
		a2[start2:start2+overlap] = ['#']
		score += overlap**2
		(overlap, start1, start2) = longestOverlap(a1, a2)

	return score
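
A usage sketch for glossOverlap, with invented glosses; it assumes re and the old-API PorterStemmer import used throughout these examples.

gloss1 = "a domesticated carnivorous mammal that typically has a long snout"
gloss2 = "any carnivorous mammal of the dog family"
score = glossOverlap(gloss1, gloss2)
# The score sums the squared lengths of the longest stemmed, non-stopword overlaps;
# here it is driven mainly by the shared phrase 'carnivorous mammal'.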