Example #1
File: stem.py Project: mardix/libmunin
class StemProvider(Provider):
    """Stem the input values (either a single word or a list of words)

    Uses the Porter stemmer algorithm.
    """
    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note::

            This does not depend on nltk, it depends on the ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)

    def do_process(self, input_value):
        if isinstance(input_value, str):
            return self._stemmer.stemWord(input_value)
        else:
            return self._stemmer.stemWords(input_value)
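# Minimal usage sketch (not part of mardix/libmunin) of the PyStemmer calls
# wrapped by do_process() above, assuming the ``pystemmer`` package is installed:
from Stemmer import Stemmer

_stemmer = Stemmer('english')
print(_stemmer.stemWord('running'))             # a single word gives a single stem
print(_stemmer.stemWords(['running', 'cats']))  # a list of words gives a list of stems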
Example #2
def classif(text, mass, num_all_docs, num_words_unic):
    stm = Stemmer('russian')
    text = stm.stemWords(regexp_tokenize((text.decode('UTF-8')).lower(), r"(?x) \w+ | \w+(-\w+)*"))
    num_povt_words = 0
    summa = 0
    while_iter = 0
    while while_iter < len(mass):
        summand_1 = log((mass[while_iter].num_docs + 0.0) / (num_all_docs + 0.0) + 0.0, 1.1)
        for i in text:
            for i1 in mass[while_iter].lst_allword:
                if i == i1:
                    num_povt_words = num_povt_words + 1
            summand_2 = log(((num_povt_words + 1) + 0.0) / ((num_words_unic + mass[while_iter].num_words) + 0.0), 1.1)
            num_povt_words = 0
            summa = summa + summand_2
        mass[while_iter].c = summand_1 + summa
        summa = 0
        while_iter = while_iter + 1

    max_c = -100000
    while_iter = 0
    number_max = 0

    while while_iter < len(mass):
        print mass[while_iter].c
        if mass[while_iter].c > max_c:
            max_c = mass[while_iter].c
            number_max = while_iter
        while_iter = while_iter + 1
    print mass[number_max].name_categories
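# Illustrative sketch (toy, hypothetical counts) of the add-one smoothed
# log-likelihood accumulated as summand_2 above:
# log((count + 1) / (num_words_unic + num_words), base 1.1).
from math import log

count_in_category = 3    # hypothetical occurrences of one query word in the category
num_words_unic = 1000    # hypothetical number of unique words over all categories
num_words = 250          # hypothetical total word count of this category
print(log((count_in_category + 1.0) / (num_words_unic + num_words), 1.1))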
Example #3
def run():
    stemmer = Stemmer("english")
    pages = db.en.find()
    print colored.yellow("statistic words") 
    wordstatistic = {}
    for page in progress.bar(pages,size=db.en.count()):
        data = page.get("data")
        if not data:continue
        content = data.get("content")
        if not content:
            db.en.remove({"_id":page["_id"]})
            continue
        words = EN_WORD_CUT.split(content)
        for word in words:
            w=stemmer.stemWord(word.strip()).lower()
            if w and len(w)<20 and not w in EN_IGNORE:
                if wordstatistic.get(w):
                    wordstatistic[w]+=1
                else:
                    wordstatistic[w]=1

    
    print colored.yellow("save to en_words_freq")
    savequene = []
    for k,v in progress.bar(wordstatistic.iteritems(),size=len(wordstatistic)):
        savequene.append({"_id":k,"freq":v})
        if len(savequene) >=1000:
            db.en_words_freq.insert(savequene)
            savequene=[]
        
    if savequene:db.en_words_freq.insert(savequene)
    print colored.cyan(
            "count of en_words_freq: %d" % db.en_words_freq.count())
Example #4
def getStems(cleanedText, stopWords):
    stems = {}
    matches = re.finditer(r'\w+(\.?\w+)*', cleanedText.strip(), flags=re.IGNORECASE)
    stemmer = Stemmer('english')
    #maxlength = sum(1 for _ in matches1)
    #stemmer.maxCacheSize = maxlength
    offset = len(termDict)
    tokenid = offset + 1
    position = 0
    for match in matches:
        #position = match.start()
        position += 1 
        token = match.group()
        filteredToken = filterToken(token, stopWords)
        if filteredToken and filteredToken is not None:
            wordStem = stemmer.stemWord(filteredToken.lower())
            #present = wordStem in stems
            if wordStem not in stems:
                #tokenid += 1
                stems[wordStem] = tokenid
                positions = set()
                positions.add(position)
                if wordStem not in termDict:
                    termDict[wordStem] = tokenid
                    terms[tokenid] = positions
                    tokenid = tokenid + 1
                else:
                    stemid = termDict[wordStem] 
                    terms[stemid] = positions
            else:
                stemid = termDict[wordStem]
                postns = terms[stemid]
                postns.add(position)
                terms[stemid] = postns
Example #5
File: mongosearch.py Project: ktf/DAS
 def _prepare_text(self, text):
     """Extracts and stems the words from some given text.
     """
     words = re.findall('[a-z0-9\']+', text.lower())
     words = [word for word in words if word not in STOP_WORDS]
     stemmer = Stemmer('english')
     stemmed_words = stemmer.stemWords(words)
     return stemmed_words
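# Standalone sketch of the same extract/filter/stem pipeline, using a tiny
# hypothetical stop-word set in place of DAS's STOP_WORDS:
import re
from Stemmer import Stemmer

SAMPLE_STOP_WORDS = {'the', 'of', 'a'}
text = "The stemming of words"
words = re.findall("[a-z0-9']+", text.lower())
words = [word for word in words if word not in SAMPLE_STOP_WORDS]
print(Stemmer('english').stemWords(words))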
Example #6
def train(name_file_dbase, way_to_dbase):
    stm = Stemmer('russian')
    file_base = open(name_file_dbase, 'r')
    Lines = file_base.readlines()
    num_all_docs = len(Lines) + 1

    mass = []
    iter1 = 0
    iter2 = 0

    for line in Lines:
        number1, address1 = unpack_line(line)
        number = number1.strip("\n")
        address = address1.strip("\n")
        if (number == "1"):
            mass.append(Categories())
            mass[iter1].name_categories = address1
            mass[iter1 - 1].num_docs = iter2
            iter1 = iter1 + 1
            iter2 = 0
        iter2 = iter2 + 1
    mass[len(mass) - 1].num_docs = iter2
    while_iter = 0

    file_base.close()
    number = 1

    while while_iter < len(mass):
        while number <= mass[while_iter].num_docs:
            file_forclass = open(way_to_dbase + mass[while_iter].name_categories
                                 + '/' + str(number) + 'forclass.txt', 'r')
            str_read = re.sub("^\s+|\n|\r|\s+$", ' ', file_forclass.read())
            mass[while_iter].line_allword = mass[while_iter].line_allword + str_read
            file_forclass.close()
            number = number + 1
        while_iter = while_iter + 1
        number = 1

    while_iter = 0

    while while_iter < len(mass):
        forstemmer = mass[while_iter].line_allword.decode('UTF-8')
        str_read = stm.stemWords(regexp_tokenize(forstemmer.lower(), r"(?x) \w+ | \w+(-\w+)*"))
        mass[while_iter].num_words = len(str_read)
        mass[while_iter].lst_allword = str_read
        lst_unic_words = list(set(mass[while_iter].lst_allword))
        mass[while_iter].num_wordsunic = len(lst_unic_words)
        while_iter = while_iter + 1

    all_words = 0
    num_words_unic = 0
    while_iter = 0

    while while_iter < len(mass):
        all_words = all_words + mass[while_iter].num_words
        num_words_unic = num_words_unic + mass[while_iter].num_wordsunic
        while_iter = while_iter + 1
    return mass, num_all_docs, num_words_unic
Example #7
    def get_search_phrases(self, indexing_func=None):
        """Returns search phrases from properties in a given Model instance.

        Args (optional):
            only_index: List of strings.  Restricts indexing to these property names.
            indexing_func: A function that returns a set of keywords or phrases.

        Note that the indexing_func can be passed in to allow more customized
        search phrase generation.

        The following model variables influence the output of this method:
            INDEX_ONLY: If None, all indexable properties are indexed.
                If a list of property names, only those properties are indexed.
            INDEX_MULTI_WORD: Class variable that allows multi-word search
                phrases like "statue of liberty."
            INDEX_STEMMING: Returns stemmed phrases.
        """
        if not indexing_func:
            klass = self.__class__
            if klass.INDEX_MULTI_WORD:
                indexing_func = klass.get_search_phraseset
            else:
                indexing_func = klass.get_simple_search_phraseset
        if self.INDEX_STEMMING:
            stemmer = Stemmer('english')
        phrases = set()

        # allow indexing of 'subentities' such as tasks of a list as well
        queries = [(self,self.INDEX_ONLY)] + self.INDEX_SUBENTITY_QUERIES
        import logging
        for query, props in queries:
            entities = []
            try:
                subentities = query(self).fetch(1000)
                # get all of them
                while len(subentities) > 0:
                    entities.extend(subentities)
                    last_key = subentities[-1].key()
                    subentities = query(self).order('__key__').filter('__key__ >',last_key).fetch(1000)
            except TypeError, e: # query is not callable because it's an actual entity
                entities = [query]
            for entity in entities:
                for prop_name, prop_value in entity.properties().iteritems():
                    if not props or prop_name in props:
                        values = prop_value.get_value_for_datastore(entity)
                        if not isinstance(values, list):
                            values = [values]
                        if (isinstance(values[0], basestring) and
                                not isinstance(values[0], datastore_types.Blob)):
                            for value in values:
                                words = indexing_func(value,add_stop_words=self.INDEX_ADD_STOP_WORDS)
                                if self.INDEX_STEMMING:
                                    stemmed_words = set(stemmer.stemWords(words))
                                    phrases.update(stemmed_words)
                                else:
                                    phrases.update(words)
Example #8
File: textutils.py Project: jmvanel/sulci
def make_index(expression):
    """
    Standardize the expression and return a tuple that maximizes
    matching possibilities.
    expression must be a list or tuple.
    """
    stemmer = Stemmer("french")
    expression = [stemmer.stemWord(normalize_token(w)) for w in expression]
    expression.sort()
    return tuple(expression)
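# Hedged usage sketch of make_index(); ``normalize_token`` is a sulci helper,
# so a simple lower-casing stand-in is used here purely for illustration:
from Stemmer import Stemmer

def normalize_token_stub(token):
    return token.lower()

stemmer = Stemmer("french")
expression = ["Chats", "noirs"]
print(tuple(sorted(stemmer.stemWord(normalize_token_stub(w)) for w in expression)))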
Example #9
def processQueries(queries):
    queryList = []
    for query in queries:
        filteredQuery = tokenize.filterToken(query, tokenize.getStopWords())
        if filteredQuery and filteredQuery is not None:
            stemmer = Stemmer('english')
            queryStem = stemmer.stemWord(filteredQuery.lower())
            queryList.append(queryStem)
    
    return queryList
Example #10
    def parse_html(html):
        words = dehtml(html)

        s = Stemmer("danish")

        result = []
        for w in words.split():
            word = w.lower()
            if word in stop_words or len(word) < 2 or word.count('\\'):
                continue

            result.append(s.stemWord(word))
        return result
Example #11
def getTerm(term):
    term_ids = {}
    term_ids_file = open(TERMIDSFILE, 'rU')
    
    for line in term_ids_file.readlines():
        pieces = line.strip().split('\t')
        stemmer = Stemmer('english')
        #stemmer.maxCacheSize = 1
        termStem = stemmer.stemWord(term.lower())
        if termStem == pieces[1]:
            term_ids[pieces[1]] = int(pieces[0])
            return term_ids
    
    term_ids_file.close()
    return term_ids
Example #12
class BagOfWordsFeatureBooleanizer(FeatureBooleanizer):
  def __init__(self, featureName, featuresData, featureId):
    FeatureBooleanizer.__init__(self, featureName, featuresData, featureId)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    stopListFn = './resources/general/stopword.csv'
    self.stopList = frozenset(l for l in filter(None, map(lambda l: self.preprocess(l), open(stopListFn, 'rt').readlines())))
    allWords = set()
    if self.featureName == 'Basic: Tagline':
      for row in featuresData: allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split(','))))
    else:
      for row in featuresData: allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split())))
    self.words = sorted(list(filter(None, allWords - self.stopList)))
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def getFeatureNames(self):
    return [self.featureName + ': ' + word for word in self.words]
  
  def process(self, v):
    vWords = set(map(lambda w: self.preprocess(w), filter(None, v.split(','))))
    return [(word in vWords) for word in self.words]
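# Toy illustration (hypothetical vocabulary) of the booleanization performed by
# process(): each vocabulary word maps to True/False depending on its presence.
vocabulary = ['cat', 'dog', 'fish']
value_words = {'dog', 'bird'}
print([(word in value_words) for word in vocabulary])  # [False, True, False]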
Example #13
class BagOfWordsFeatureSupport(FeatureSupport):
  def __init__(self, featuresData, featureId):
    FeatureSupport.__init__(self, featuresData, featureId)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    stopListFn = './resources/general/stopword.csv'
    self.stopList = frozenset(l for l in filter(None, map(lambda l: self.preprocess(l), open(stopListFn, 'rt').readlines())))
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def extract(self, i):
    bag = frozenset(map(lambda w: self.preprocess(w), filter(None, self[i].split())))
    ret = bag - self.stopList
    if len(ret) == 0: ret = frozenset([''.join(random.choice('abcdefghjiklmnopqrstuvwxyz') for _ in range(20))])
    return ret
  
  def similarity(self, a, b):
    num = len(a & b)
    den = len(a | b)
    return num / den if den != 0 else 1.0
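# Worked example (toy stem sets) of the Jaccard similarity computed by
# similarity() above: |a & b| / |a | b|.
a = frozenset(['run', 'cat', 'hous'])
b = frozenset(['run', 'dog'])
num = len(a & b)  # 1
den = len(a | b)  # 4
print(num / den if den != 0 else 1.0)  # 0.25 under Python 3's true division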
Example #14
    def stem_words(self, words: List[str]) -> List[str]:
        """Stem list of words with PyStemmer."""
        language_code = self.language_code()
        words = decode_object_from_bytes_if_needed(words)

        # Normalize apostrophe so that "it’s" and "it's" get treated identically (it's being done in
        # _tokenize_with_spaces() too but let's not assume that all tokens that are to be stemmed go through sentence
        # tokenization first)
        words = [word.replace("’", "'") for word in words]

        if language_code is None:
            raise McLanguageException("Language code is None.")

        if words is None:
            raise McLanguageException("Words to stem is None.")

        # (Re-)initialize stemmer if needed
        if self.__pystemmer is None:

            try:
                self.__pystemmer = PyStemmer(language_code)
            except Exception as ex:
                raise McLanguageException(
                    "Unable to initialize PyStemmer for language '%s': %s" % (language_code, str(ex),)
                )

        stems = self.__pystemmer.stemWords(words)

        if len(words) != len(stems):
            log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

        # Perl's Snowball implementation used to return lowercase stems
        stems = [stem.lower() for stem in stems]

        return stems
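# Self-contained sketch of the two steps above (apostrophe normalization plus
# PyStemmer stemming), hard-wiring the 'english' algorithm for illustration:
from Stemmer import Stemmer as PyStemmer

words = ["it’s", "stemming"]
words = [word.replace("’", "'") for word in words]
print(PyStemmer('english').stemWords(words))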
Example #15
File: en.py Project: JelteF/sphinx
            class Stemmer(object):
                def __init__(self):
                    # type: () -> None
                    self.stemmer = PyStemmer('porter')

                def stem(self, word):
                    # type: (unicode) -> unicode
                    return self.stemmer.stemWord(word)
Example #16
    def __init__(self, language):
        """ Initializes attributes with the language provided.

        Args:
            language (str): The language used to stem ('french', 'english').

        """
        self.stemmer = Stemmer(language)
        self.stopwords = stopwords.words(language)
Example #17
 def __init__(self, featureName, featuresData, featureId):
   FeatureBooleanizer.__init__(self, featureName, featuresData, featureId)
   self.stemmer = Stemmer('english')
   self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
   stopListFn = './resources/general/stopword.csv'
   self.stopList = frozenset(l for l in filter(None, map(lambda l: self.preprocess(l), open(stopListFn, 'rt').readlines())))
   allWords = set()
   if self.featureName == 'Basic: Tagline':
     for row in featuresData: allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split(','))))
   else:
     for row in featuresData: allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split())))
   self.words = sorted(list(filter(None, allWords - self.stopList)))
Example #18
File: index.py Project: do3cc/Scanned-Docs
def index(text, accepted_languages=None, langs=None):
    registry = get_current_registry()
    if accepted_languages == None:
        accepted_languages = [x.strip() for x in
                              registry.settings["accepted_languages"].split(","
                              )]
    if langs == None:
        lang = guessLanguage(text)
        if lang not in accepted_languages:
            langs = accepted_languages
        else:
            langs = [lang]
    langs = list(set(langs).intersection(set(accepted_languages)))
    if not langs:
        langs = accepted_languages
    indexed_words = set()
    for lang in langs:
        stemmer = Stemmer(lang)
        indexed_words.update([stemmer.stemWord(x.value) for x in
                             tokenize(text)])
    return indexed_words
Example #19
File: stem.py Project: mardix/libmunin
    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note::

            This does not depend on nltk, it depends on the ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)
Example #20
class TextEater(object):
    
    def __init__(self):
        self.stoplist = gen_stops()
        self.stemmer = Stemmer('english')
    
    @coroutine
    def sent_filter(self,target):
        word = ''
        print "ready to eat lines"
        while True:
            sentence = (yield)
            target.send((sentence.lower()).split())

    @coroutine
    def word_filter(self, target):
        print "ready to eat words"
        while True:
            raw = (yield)
            target.send([self.stemmer.stemWord(w) for w in raw if len(w)<=3 or 
                    w in self.stoplist])


    @coroutine
    def ngrams(self,container, n=2,):
        "Compute n-grams" 
        while True:
            grams= (yield)
            for i in range(0, len((grams)) - (n - 1)):
                container[(tuple(grams[i:i+n]))]+=1
               
    @coroutine
    def printer(self):
        while True:
            line = (yield)
            print (line)

    @coroutine
    def typer(self,target):
        print "ready to check type"
        word = None
        while True:
            line = (yield word)
            word=  type(line)
Example #21
 def __init__(self, query, db, doc_level_search=True, stemmer=False, path='/var/lib/philologic/databases/'):
     self.path = path + db + '/'
     self.words = query.split()
     self.doc_level_search = doc_level_search
     self.results = {}
     if doc_level_search:
          self.doc_path = self.path + 'doc_arrays/'
     else:
         self.doc_path = self.path + 'obj_arrays/'
     self.stemmer = stemmer
     if stemmer:
         try:
             from Stemmer import Stemmer
             self.stemmer = Stemmer(stemmer) # where stemmer is the language selected
             self.words = [self.stemmer.stemWord(word) for word in self.words]
         except KeyError:
             print >> sys.stderr, "Language not supported by stemmer. No stemming will be done."
         except ImportError:
             print >> sys.stderr, "PyStemmer is not installed on your system. No stemming will be done."            
Example #22
class Overview(Feature):
  description = """
Basic: Overview
""".strip()

  def __init__(self, *args, **kwargs):
    Feature.__init__(self)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    self.stopList = frozenset(['a', 'abaft', 'aboard', 'about', 'abov', 'absent', 'accord', 'account', 'across', 'addit', 'afor', 'after', 'against', 'ago', 'ahead', 'all', 'along', 'alongsid', 'alreadi', 'also', 'am', 'amid', 'amidst', 'among', 'amongst', 'an', 'and', 'anenst', 'ani', 'anoth', 'anybodi', 'anyhow', 'anyon', 'anyth', 'anywher', 'apart', 'apr', 'april', 'apropo', 'apud', 'are', 'around', 'as', 'asid', 'astrid', 'at', 'athwart', 'atop', 'aug', 'august', 'back', 'bad', 'bar', 'be', 'becaus', 'been', 'befor', 'begin', 'behalf', 'behest', 'behind', 'below', 'beneath', 'besid', 'best', 'better', 'between', 'beyond', 'big', 'bigger', 'biggest', 'billion', 'blah', 'bln', 'both', 'but', 'by', 'c', 'ca', 'call', 'can', 'cannot', 'cant', 'case', 'circa', 'close', 'concern', 'could', 'couldt', 'current', 'daili', 'day', 'dec', 'decemb', 'despit', 'did', 'do', 'doe', 'doesnt', 'done', 'dont', 'down', 'due', 'dure', 'each', 'eight', 'eighteen', 'eighth', 'eighti', 'eleven', 'end', 'enough', 'ever', 'except', 'exclud', 'fail', 'far', 'feb', 'februari', 'few', 'fifth', 'first', 'five', 'fiveteen', 'fivti', 'follow', 'for', 'forenenst', 'four', 'fourteen', 'fourth', 'fourti', 'fri', 'friday', 'from', 'front', 'full', 'further', 'get', 'given', 'go', 'gone', 'goot', 'had', 'hadnt', 'has', 'hasnt', 'have', 'havent', 'he', 'her', 'here', 'herself', 'high', 'higher', 'hightst', 'himself', 'his', 'how', 'hunderd', 'i', 'if', 'in', 'includ', 'insid', 'instead', 'into', 'is', 'it', 'itself', 'jan', 'januari', 'jul', 'juli', 'jun', 'june', 'just', 'last', 'late', 'later', 'latest', 'left', 'lest', 'lieu', 'like', 'littl', 'long', 'low', 'lower', 'lowest', 'made', 'make', 'mani', 'mar', 'march', 'may', 'me', 'mean', 'mid', 'midst', 'might', 'milliard', 'million', 'mine', 'minus', 'mld', 'mln', 'modulo', 'mon', 'monday', 'month', 'more', 'most', 'mth', 'much', 'must', 'my', 'myself', 'near', 'need', 'neednt', 'neither', 'never', 'next', 'nine', 'nineteen', 'nineth', 'nineti', 'no', 'none', 'nor', 'not', 'notwithstand', 'nov', 'novemb', 'number', 'o', 'oct', 'octob', 'of', 'off', 'on', 'one', 'onli', 'onto', 'oppos', 'opposit', 'or', 'order', 'other', 'ought', 'our', 'ourselv', 'out', 'outsid', 'over', 'owe', 'pace', 'past', 'per', 'place', 'plus', 'point', 'previous', 'prior', 'pro', 'pursuant', 'put', 'qua', 'rather', 'recent', 'regard', 'regardless', 'respect', 'right', 'round', 'said', 'sake', 'same', 'san', 'sat', 'saturday', 'save', 'saw', 'say', 'second', 'see', 'seen', 'sep', 'septemb', 'seven', 'seventeen', 'seventh', 'seventi', 'sever', 'shall', 'she', 'should', 'shouldnt', 'show', 'shown', 'sinc', 'six', 'sixteen', 'sixth', 'sixti', 'small', 'smaller', 'smallest', 'so', 'some', 'somebodi', 'somehow', 'someon', 'someth', 'somewher', 'soon', 'sooner', 'spite', 'start', 'still', 'subsequ', 'such', 'sun', 'sunday', 'take', 'taken', 'tell', 'ten', 'tenth', 'than', 'thank', 'that', 'the', 'their', 'them', 'themselv', 'there', 'these', 'they', 'third', 'thirteen', 'thirti', 'this', 'those', 'thousand', 'three', 'through', 'throughout', 'thru', 'thruout', 'thu', 'thursday', 'till', 'time', 'to', 'today', 'told', 'too', 'took', 'top', 'toward', 'tue', 'tuesday', 'twelv', 'twenti', 'two', 'under', 'underneath', 'unit', 'unlik', 'until', 'unto', 'up', 'upon', 'us', 'use', 'versus', 'via', 'vice', 'view', 'virtu', 'vis', 'visavi', 'vs', 'was', 'we', 'wed', 'wednesday', 'week', 'well', 'went', 'were', 'what', 'when', 'where', 'whether', 'whi', 'which', 'while', 'who', 'whose', 'will', 'with', 'within', 
'without', 'wont', 'wors', 'worst', 'worth', 'would', 'wrt', 'xor', 'year', 'yes', 'yesterday', 'yet', 'you', 'your', 'yourself', 'yourselv', 'yr'])
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def extract(self, m):
    t = m.overview
    return ','.join(sorted(list(set(filter(lambda w: len(w) > 0 and w not in self.stopList, map(self.preprocess, t.split()))))))
Example #23
from Stemmer import Stemmer
from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords as nltk_stopwords
from sets import Set
import pdb
from config import *
import string

STOPWORDS = Set(nltk_stopwords.words('english'))
URL_STOP_WORDS = Set([
    "http", "https", "www", "ftp", "com", "net", "org", "archives", "pdf",
    "html", "png", "txt", "redirect"
])
STEMMER = Stemmer('english')
LEMMATIZER = WordNetLemmatizer()
EXTENDED_PUNCTUATIONS = Set(list(string.punctuation) + ['\n', '\t', " "])
INT_DIGITS = Set(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])

MAX_WORD_LEN = 10
MIN_WORD_LEN = 3


def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True
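# Quick illustration of isEnglish(): ASCII-only strings pass, others do not.
print(isEnglish("stemming"))  # True
print(isEnglish("café"))      # False: 'é' cannot be decoded as ASCII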
Example #24
File: en.py Project: avsyap/fitbit
            class Stemmer(object):
                def __init__(self):
                    self.stemmer = PyStemmer('porter')

                def stem(self, word):
                    return self.stemmer.stemWord(word)
Example #25
import re
import time
import math
import operator
from nltk.corpus import stopwords
import os
import pickle
from Stemmer import Stemmer

stop_words = set(stopwords.words('english'))
ps = Stemmer('porter')

root_path = "/Users/rishabhmurarka/Desktop/3rd SEm/IRE/Phase_2/"


def isASCII(word):
    """
    Remove non-ASCII characters from a string. Only if the word is an ASCII
    string should it be included in the list, dictionary, or output file.
    :param word: the string to be checked for non-ASCII characters
    :return: True if the word is ASCII-encodable; otherwise False
    """
    try:
        word = word.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True


def preprocessing(unprocessed_data):
    """
Example #26
import train
import Body
from Stemmer import Stemmer
import codecs
import nltk
import re
import math
import random
from nltk.corpus import stopwords
stop_words = stopwords.words('farsi')
stop_words.append(".")
stop_words.append(":")
stop_words.append("،")
import operator
import glob
stemmer = Stemmer()


def sigmoid(x):
    return (1 / (1 + math.exp(-x)))


def words_list(paragraph, stemmed_sent):
    ''' Takes a text as input and reduces it to an embedded list of stemmed words per sentence (without stop words). '''
    fp = codecs.open(paragraph, 'r', 'utf8')
    txt = fp.read()
    fp.close()
    sentences = txt.split('\r\n')
    #print('sentences', sentences)
    embbeded = []
    for line in sentences:
Example #27
def merge_d(d1, d2):
    union = {}
    for key in set(d1.keys()).union(d2.keys()):
        union[key] = []
        if key in d1 and key not in d2:  # if the key is only in d1
            union[key] = d1[key]
        if key in d2 and key not in d1:
            union[key] = d2[key]
        if key in d1 and key in d2:
            union[key] = d1[key] + "+" + d2[key]
    return union
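# Small usage example (illustrative values) of merge_d(): values present in both
# dicts are joined with '+', all others are carried over unchanged.
print(merge_d({'doc1': '3', 'doc2': '1'}, {'doc2': '5'}))  # {'doc1': '3', 'doc2': '1+5'}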


startTime = datetime.now()
stop_words = get_stop_words('en')
p_stemmer = Stemmer('english')
start = 900
end = 902

c = open('count_list', 'r+')
lines = c.readlines()
c.close()

for i in range(start, end):
    all_dicts = []
    pair = lines[i].split(':')
    file_num = int(pair[0]) + 1
    count = int(pair[1].split(',')[0].strip().replace('[', '')) + 1
    i_count = count
    input_file = "./output/output_%d" % file_num
    output_file = "./indices/index_%d" % file_num
Example #28
File: help.py Project: shoosen/ibid
class Help(Processor):
    usage = u"""
    what can you do|help
    help me with <category>
    how do I use <feature>
    help <(category|feature)>
    """
    feature = ('help', )
    stemmer = Stemmer('english')

    def _get_features(self):
        """Walk the loaded processors and build dicts of categories and
        features in use. Dicts are cross-referenced by string.
        """
        categories = {}
        for k, v in ibid.categories.iteritems():
            v = copy(v)
            v.update({
                'name': k,
                'features': set(),
            })
            categories[k] = v

        features = {}
        processor_modules = set()
        for processor in ibid.processors:
            for feature in getattr(processor, 'feature', []):
                if feature not in features:
                    features[feature] = {
                        'name': feature,
                        'description': None,
                        'categories': set(),
                        'processors': set(),
                        'usage': [],
                    }
                features[feature]['processors'].add(processor)
                if hasattr(processor, 'usage'):
                    features[feature]['usage'] += [
                        line.strip() for line in processor.usage.split('\n')
                        if line.strip()
                    ]
            processor_modules.add(sys.modules[processor.__module__])

        for module in processor_modules:
            for feature, meta in getattr(module, 'features', {}).iteritems():
                if feature not in features:
                    continue
                if meta.get('description'):
                    features[feature]['description'] = meta['description']
                for category in meta.get('categories', []):
                    features[feature]['categories'].add(category)
                    categories[category]['features'].add(feature)

        categories = dict(
            (k, v) for k, v in categories.iteritems() if v['features'])

        usere = re.compile(r'[\s()[\]<>|]+')
        for name, feat in features.iteritems():
            feat['usage_keywords'] = frozenset(
                self.stemmer.stemWord(word.strip())
                for word in usere.split(u' '.join(feat['usage']))
                if word.strip())
        for name, cat in categories.iteritems():
            cat['description_keywords'] = frozenset(
                self.stemmer.stemWord(word)
                for word in cat['description'].lower().split())
        for name in features.keys():
            st_name = self.stemmer.stemWord(name)
            features[st_name] = features[name]
            if st_name != name:
                del features[name]
        for name in categories.keys():
            st_name = self.stemmer.stemWord(name)
            categories[st_name] = categories[name]
            if st_name != name:
                del categories[name]

        return categories, features

    def _describe_category(self, event, category):
        """Respond with the help information for a category"""
        event.addresponse(
            u'I use the following features for %(description)s: '
            u'%(features)s\n'
            u'Ask me "how do I use ..." for more details.', {
                'description': category['description'].lower(),
                'features': human_join(sorted(category['features'])),
            },
            conflate=False)

    def _describe_feature(self, event, feature):
        """Respond with the help information for a feature"""
        output = []
        desc = feature['description']
        if desc is None:
            output.append(u'You can use it like this:')
        elif len(desc) > 100:
            output.append(desc)
            output.append(u'You can use it like this:')
        elif desc.endswith('.'):
            output.append(desc + u' You can use it like this:')
        else:
            output.append(desc + u'. You can use it like this:')

        for line in feature['usage']:
            output.append(u'  ' + line)

        event.addresponse(u'\n'.join(output), conflate=False)

    def _usage_search(self, event, terms, features):
        terms = frozenset(self.stemmer.stemWord(term) for term in terms)
        results = set()
        for name, feat in features.iteritems():
            if terms.issubset(feat['usage_keywords']):
                results.add(name)
        results = sorted(results)
        if len(results) == 1:
            self._describe_feature(event, features[results[0]])
        elif len(results) > 1:
            event.addresponse(
                u"Please be more specific. I don't know if you mean %s",
                human_join((features[result]['name'] for result in results),
                           conjunction=u'or'))
        else:
            event.addresponse(
                u"I'm afraid I don't know what you are asking about. "
                u'Ask "what can you do" to browse my features.')

    @match(r'^(?:help|features|what\s+(?:can|do)\s+you\s+do)$')
    def intro(self, event):
        categories, features = self._get_features()
        categories = filter(lambda c: c['weight'] is not None,
                            categories.itervalues())
        categories = sorted(categories, key=lambda c: c['weight'])
        event.addresponse(
            u'I can help you with: %s.\n'
            u'Ask me "help me with ..." for more details.',
            human_join(c['description'].lower() for c in categories),
            conflate=False)

    @match(r'^help\s+(?:me\s+)?with\s+(.+)$')
    def describe_category(self, event, terms):
        categories, features = self._get_features()
        termset = frozenset(
            self.stemmer.stemWord(term) for term in terms.lower().split())

        if len(termset) == 1:
            term = list(termset)[0]
            exact = [c for c in categories.itervalues() if c['name'] == term]
            if exact:
                self._describe_category(event, exact[0])
                return

        results = []
        for name, cat in categories.iteritems():
            if termset.issubset(cat['description_keywords']):
                results.append(name)

        if len(results) == 0:
            for name, cat in categories.iteritems():
                if terms.lower() in cat['description'].lower():
                    results.append(name)

        results.sort()
        if len(results) == 1:
            self._describe_category(event, categories[results[0]])
            return
        elif len(results) > 1:
            event.addresponse(
                u"Please be more specific, I don't know if you mean %s.",
                human_join(
                    ('%s (%s)' % (categories[r]['description'].lower(), r)
                     for r in results),
                    conjunction=u'or'))
            return

        event.addresponse(
            u"I'm afraid I don't know what you are asking about. "
            u'Ask "what can you do" to browse my features.')

    @match(r'^(?:help|usage|modinfo)\s+(\S+)$')
    def quick_help(self, event, terms):
        categories, features = self._get_features()
        terms = frozenset(terms.lower().split())
        if len(terms) == 1:
            term = list(terms)[0]
            exact = [c for c in categories.itervalues() if c['name'] == term]
            if exact:
                self._describe_category(event, exact[0])
                return
            exact = [f for f in features.itervalues() if f['name'] == term]
            if exact:
                self._describe_feature(event, exact[0])
                return

        self._usage_search(event, terms, features)

    @match(r'^how\s+do\s+I(?:\s+use)?\s+(.+)$')
    def describe_feature(self, event, feature):
        categories, features = self._get_features()

        feature = feature.lower()
        exact = [f for f in features.itervalues() if f['name'] == feature]
        if exact:
            self._describe_feature(event, exact[0])
        else:
            self._usage_search(event, frozenset(feature.split()), features)

    @match(r'^\s*(?:help\s+me\s+with|how\s+do\s+I(?:\s+use)?)\s+\.\.\.\s*$',
           version='deaddressed')
    def silly_people(self, event):
        event.addresponse(
            u'You must replace the ellipsis with the thing you are after')
Example #29
import zlib
from collections import *
import xml.etree.cElementTree as et
import re
import os
from Stemmer import Stemmer
import time

wikiFilePath = input("Please Enter path to wiki XML file :\n")
current_directory = os.getcwd()
baseDirectory = os.path.join(current_directory, r'TemporaryIndex/')
if not os.path.exists(baseDirectory):
    os.makedirs(baseDirectory)

start_time1 = time.time()
stemmer = Stemmer("english")
pattern = re.compile("[^a-zA-Z]")  # pattern for splitting text
stop_words = {}  # words that are not significant
stop_words_file = open("Stop_words.txt", "r")
content = stop_words_file.read()
content = re.split(",", content)
for word in content:
    if word:
        stop_words[word] = True

words_index = defaultdict(list)
inTitle = 0  # indicator for title hit
inSubTitle = 1  # indicator for sub title hit
inCategory = 2  # indicator for category hit
inText = 3  # indicator for text hit
Example #30
            class Stemmer(object):
                def __init__(self):
                    self.stemmer = PyStemmer('porter')

                def stem(self, word):
                    return self.stemmer.stemWord(word)
Example #31
file = open(
    "C:\\Users\Administrator\\Desktop\\myfolder\\corpora\\stats\\ielts-7to11-some.txt"
)
raw = file.read()

try:
    wordlist = nltk.word_tokenize(raw)

    lemmatizer = WordNetLemmatizer()
    print lemmatizer.lemmatize("ran")
    lanster = LancasterStemmer()
    porter = PorterStemmer()
    snowball = SnowballStemmer("english")
    isri = ISRIStemmer()
    rslp = RSLPStemmer()
    porter2 = Stemmer('english')

    endOfString = StringEnd()
    prefix = oneOf(
        "uni inter intro de con com anti pre pro per an ab ad af ac at as re in im ex en em un dis over sub syn out thermo philo geo for fore back"
    )
    suffix = oneOf("ish")
    #suffix = oneOf("or er ed ish ian ary ation tion al ing ible able ate ly ment ism ous ness ent ic ive "
    #               "ative tude ence ance ise ant age cide ium ion")

    word = (Optional(prefix)("prefixes") +
            SkipTo(suffix | suffix + FollowedBy(endOfString)
                   | endOfString)("root") +
            ZeroOrMore(suffix | suffix + FollowedBy(endOfString))("suffix"))
    #word = (Optional(prefix)("prefixes") + SkipTo(FollowedBy(endOfString))("root"))
Example #32
 def stemmer(self):
     if not hasattr(self, '_stemmer'):
         from Stemmer import Stemmer
         self._stemmer = Stemmer(self.lang)
     return self._stemmer
Example #33
def stemmer(listofTokens):                                          #Stemming
  stemmer=Stemmer("english")
  stemmedWords=[ stemmer.stemWord(key) for key in listofTokens ]
  return stemmedWords
Example #34
class Lemmatizer(object):

    def __init__(self, words_file=default_words, verbs_file=default_verbs, joined_verb_parts=True):
        self.verbs = {}
        self.stemmer = Stemmer()

        tokenizer = WordTokenizer(words_file=default_words, verbs_file=verbs_file)
        self.words = tokenizer.words

        if verbs_file:
            self.verbs['است'] = '#است'
            for verb in tokenizer.verbs:
                for tense in self.conjugations(verb):
                    self.verbs[tense] = verb
            if joined_verb_parts:
                for verb in tokenizer.verbs:
                    bon = verb.split('#')[0]
                    for after_verb in tokenizer.after_verbs:
                        self.verbs[bon + 'ه_' + after_verb] = verb
                        self.verbs['ن' + bon + 'ه_' + after_verb] = verb
                    for before_verb in tokenizer.before_verbs:
                        self.verbs[before_verb + '_' + bon] = verb

    def lemmatize(self, word, pos=''):
        if not pos and word in self.words:
            return word

        if (not pos or pos == 'V') and word in self.verbs:
            return self.verbs[word]

        if pos.startswith('AJ') and word[-1] == 'ی':
            return word

        if pos == 'PRO':
            return word

        if word in self.words:
            return word

        stem = self.stemmer.stem(word)
        if stem and stem in self.words:
            return stem

        return word

    def conjugations(self, verb):

        past, present = verb.split('#')
        ends = ['م', 'ی', '', 'یم', 'ید', 'ند']

        if verb == '#هست':
            return ['هست' + end for end in ends] + ['نیست' + end for end in ends]

        past_simples = [past + end for end in ends]
        past_imperfects = ['می‌' + item for item in past_simples]
        ends = ['ه‌ام', 'ه‌ای', 'ه', 'ه‌ایم', 'ه‌اید', 'ه‌اند']
        past_narratives = [past + end for end in ends]

        imperatives = ['ب' + present, 'ن' + present]

        if present.endswith('ا') or present in ('آ', 'گو'):
            present = present + 'ی'

        ends = ['م', 'ی', 'د', 'یم', 'ید', 'بودم', 'ند']
        present_simples = [present + end for end in ends]
        present_imperfects = ['می‌' + item for item in present_simples]
        present_subjunctives = [item if item.startswith('ب') else 'ب' + item for item in present_simples]
        present_not_subjunctives = ['ن' + item for item in present_simples]

        with_nots = lambda items: items + list(map(lambda item: 'ن' + item, items))
        aa_refinement = lambda items: list(map(lambda item: item.replace('بآ', 'بیا').replace('نآ', 'نیا'), items)) if \
            items[0].startswith('آ') else items
        return aa_refinement(
            with_nots(past_simples) + with_nots(present_simples) + with_nots(past_imperfects) + with_nots(
                past_narratives) + with_nots(present_simples) + with_nots(
                present_imperfects) + present_subjunctives + present_not_subjunctives + imperatives)
Example #35
class Searcher(object):
    """Run a search on documents or objects within documents
    in the SQLite table
    Three scoring options are available: Frequency, TF-IDF and BM25
    Two methods of incrementing the scores of results are available:
    simple addition or best score"""
    
    
    def __init__(self, query, db, doc_level_search=True, stemmer=False, path='/var/lib/philologic/databases/'):
        self.path = path + db + '/'
        self.words = query.split()
        self.doc_level_search = doc_level_search
        self.results = {}
        if doc_level_search:
             self.doc_path = self.path + 'doc_arrays/'
        else:
            self.doc_path = self.path + 'obj_arrays/'
        self.stemmer = stemmer
        if stemmer:
            try:
                from Stemmer import Stemmer
                self.stemmer = Stemmer(stemmer) # where stemmer is the language selected
                self.words = [self.stemmer.stemWord(word) for word in self.words]
            except KeyError:
                print >> sys.stderr, "Language not supported by stemmer. No stemming will be done."
            except ImportError:
                print >> sys.stderr, "PyStemmer is not installed on your system. No stemming will be done."            
        
    def get_hits(self, word, doc=True):
        """Query the SQLite table and return a list of tuples containing the results"""
        cursor = sqlite_conn(self.path + 'hits_per_word.sqlite')
        if self.doc_level_search:
            cursor.execute('select doc_id, word_freq, total_words from doc_hits where word=?', (word,))
        else:
            cursor.execute('select obj_id, word_freq, total_words from obj_hits where word=?', (word,))
        return cursor.fetchall()
        
    def id_to_word(self, id):
        """Return the word given its ID"""
        m = mapper(self.path)
        return m[id]
        
    def get_idf(self, hits):
        """Return IDF score"""
        total_docs = doc_counter(self.doc_path) #### WRONG COUNT
        try:
            return log(float(total_docs) / float(len(hits))) + 1
        except ZeroDivisionError:
            return 0
               
    def search(self, measure='tf_idf', scoring='simple_scoring', intersect=False, display=10):
        """Searcher function"""
        self.intersect = False
        if self.words != []:
            for word in self.words:
                hits = self.get_hits(word)
                getattr(self, measure)(hits, scoring)
                if intersect:
                    if self.intersect:
                        self.docs = self.docs.intersection(self.new_docs)
                        self.new_docs = set([])
                    else:
                        self.intersect = True
                        self.docs = set([obj_id for obj_id in self.results])
                        self.new_docs = set([])
            if intersect:
                self.results = dict([(obj_id, self.results[obj_id]) for obj_id in self.results if obj_id in self.docs])
            return sorted(self.results.iteritems(), key=itemgetter(1), reverse=True)[:display]
        else:
            return []
    
    def debug_score(self, hits, scoring):
        for obj_id, word_freq, word_sum in hits:
            getattr(self, scoring)(obj_id, word_freq)
    
    def tf_idf(self, hits, scoring):
        idf = self.get_idf(hits)
        for obj_id, word_freq, word_sum in hits:
            tf = float(word_freq) / float(word_sum)
            score = tf * idf
            getattr(self, scoring)(obj_id, score)
                    
    def frequency(self, hits, scoring):
        for obj_id, word_freq, word_sum in hits:
            score = float(word_freq) / float(word_sum)
            getattr(self, scoring)(obj_id, score)
                    
    def bm25(self, hits, scoring, k1=1.2, b=0.75):
        ## a floor is applied to normalized length of doc
        ## in order to diminish the importance of small docs
        ## see http://xapian.org/docs/bm25.html
        idf = self.get_idf(hits)
        avg_dl = avg_doc_length(self.path)
        for obj_id, word_freq, obj_length in hits:
            tf = float(word_freq)
            dl = float(obj_length)
            temp_score = tf * (k1 + 1.0)
            temp_score2 = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))
            score = idf * temp_score / temp_score2
            getattr(self, scoring)(obj_id, score)
                    
    def simple_scoring(self, obj_id, score):
        if self.intersect:
            self.new_docs.add(obj_id)
        if obj_id not in self.results:
            self.results[obj_id] = score
        else:
            self.results[obj_id] += score
    
    def dismax_scoring(self, obj_id, score):
        if self.intersect:
            self.new_docs.add(obj_id)
        if obj_id not in self.results:
            self.results[obj_id] = score
        else:
            if score > self.results[obj_id]:
                self.results[obj_id] = score
                
    def lda_search(self, measure='tf_idf', scoring='simple_scoring', intersect=False, display=10):
        """Searcher function"""
        self.intersect = False
        self.words = [words.decode('utf-8') for words in self.words]
        if self.words != []:
            lda_query = self.match_topic()
            if lda_query != None:
                for word in self.words[:1]:  # temporary slice, to offer it as an option?
                    lda_query[word] = sum([lda_query[term] for term in lda_query])
                print lda_query
                self.num_hits = {}
                for other_word, freq in lda_query.iteritems():
                    hits = self.get_hits(other_word)
                    results = self.lda_scoring(hits, scoring, freq, measure)
                self.results = dict([(obj_id, self.results[obj_id] * self.num_hits[obj_id]) for obj_id in self.results if self.num_hits[obj_id] > 1])
                return sorted(self.results.iteritems(), key=itemgetter(1), reverse=True)[:display]
            else:
                return []
        else:
            return []
            
    def match_topic(self):
        topic_id = int
        cursor = sqlite_conn(self.path + 'lda_topics.sqlite')
        if len(self.words) == 1:
            cursor.execute('select topic, position from word_position where word=? order by position', (self.words[0],))
            try:
                topic_id = cursor.fetchone()[0]
            except TypeError:
                return None
        else:
            topic_pos = {}
            topic_matches = {}
            query = 'select topic, position from word_position where word="%s"' % self.words[0]
            for word in self.words[1:]:
                query += ' or word="%s"' % word
            cursor.execute(query)
            for topic, position in cursor.fetchall():
                if topic not in topic_pos:
                    topic_pos[topic] = position
                    topic_matches[topic] = 1
                else:
                    topic_pos[topic] += position
                    topic_matches[topic] += 1
            word_num = len(self.words)
            topics = [(topic, topic_pos[topic]) for topic in topic_pos if topic_matches[topic] == word_num]
            if topics == []:
                topics = [(topic, topic_pos[topic]) for topic in topic_pos if topic_matches[topic] == word_num - 1]
            topic_id = sorted(topics, key=itemgetter(1))[0][0]
        cursor.execute('select words from topics where topic=?', (topic_id,))
        results = json.loads(cursor.fetchone()[0])
        topic = [(term, float(freq)) for term, freq in results.iteritems()]# if float(freq) > 0.01]
        topic = dict(sorted(topic, key=itemgetter(1), reverse=True)[:10])
        return topic
        
    def lda_scoring(self, hits, scoring, freq, measure):
        if measure == 'tf_idf':
            idf = self.get_idf(hits)
            for obj_id, word_freq, word_sum in hits:
                tf = float(word_freq) / float(word_sum)
                score = tf * idf * freq
                if obj_id not in self.results:
                    self.results[obj_id] = score
                    self.num_hits[obj_id] = 1
                else:
                    self.results[obj_id] += score    
                    self.num_hits[obj_id] += 1
        else:
            idf = self.get_idf(hits)
            avg_dl = avg_doc_length(self.path)
            k1 = 1.2
            b = 0.75
            for obj_id, word_freq, obj_length in hits:
                tf = float(word_freq)
                dl = float(obj_length)
                temp_score = tf * (k1 + 1.0)
                temp_score2 = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))
                score = idf * temp_score / temp_score2 * freq
                if obj_id not in self.results:
                    self.results[obj_id] = score
                    self.num_hits[obj_id] = 1
                else:
                    self.results[obj_id] += score    
                    self.num_hits[obj_id] += 1
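# Toy walk-through (hypothetical numbers) of the BM25 score computed in bm25()
# above: idf * tf * (k1 + 1) / (tf + k1 * ((1 - b) + b * floor(dl / avg_dl))).
from math import floor, log

k1, b = 1.2, 0.75
idf = log(1000.0 / 20.0) + 1        # as in get_idf(): 1000 docs, 20 hits
tf, dl, avg_dl = 3.0, 180.0, 120.0  # hypothetical term frequency and lengths
print(idf * (tf * (k1 + 1.0)) / (tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))))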
Example #36
def merge_d(d1, d2):
    union = {}
    for key in set(d1.keys()).union(d2.keys()):
        union[key] = []
        if key in d1 and key not in d2:  # if the key is only in d1
            union[key] = d1[key]
        if key in d2 and key not in d1:
            union[key] = d2[key]
        if key in d1 and key in d2:
            union[key] = d1[key] + "+" + d2[key]
    return union


startTime = datetime.now()
stop_words = get_stop_words('en')
p_stemmer = Stemmer('english')
start = 900
end = 910

c = open('count_list', 'r+')
lines = c.readlines()
c.close()

for i in range(start, end):
    all_dicts = []
    pair = lines[i].split(':')
    file_num = int(pair[0]) + 1
    count = int(pair[1].split(',')[0].strip().replace('[', '')) + 1
    i_count = count
    input_file = "./output/output_%d" % file_num
    output_file = "./indices/index_%d" % file_num
Example #37
from __future__ import print_function
import xml.etree.ElementTree as etree
import re, os, heapq, math, operator, string, time, sys
from collections import *
from Stemmer import Stemmer as PyStemmer
import glob

reload(sys)
sys.setdefaultencoding('utf-8')
ps = PyStemmer('porter')

if (len(sys.argv[1:]) < 1):
    print("Needs 1 argument, the index directory")
    sys.exit()

indexDirPth = sys.argv[1]
# qryTxtFlPth = sys.argv[2]
# outTxtFlPth = sys.argv[3]

# if not os.path.exists(outTxtFlPth):
#     with open(outTxtFlPth, 'w+'): pass
# else:
# 	open(outTxtFlPth, 'w').close()

absltPthCurrPrgrm = os.path.abspath(os.path.dirname(sys.argv[0]))
###########################################################################

stopwords = dict()
inverted_index_file, mapping, doc_offset = list(), list(), list()
inverted_index_file.append(
    open(os.path.join(indexDirPth, 'title/final.txt'), 'r'))
Example #38
			loc = mid
			break

		else:
			loc = mid
			break

		mid = (start + end) // 2
	if numbers[mid] == key:
		return mid
	if numbers[mid] < key:
		return mid 
	return mid -1


stemObj=Stemmer('porter')
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("\w+|\$[\d\.]+|\S+")
# tokenize text
def tokenizeText(textInput):
	normalized=[]
	#textInput=removeURL(textInput)
	#tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", textInput)
	tokens=re.split(r'[^A-Za-z0-9]+',textInput)
	#tokens = [x for x in tokens if re.match(r"^[a-z]+$", x.lower())]
	for token in tokens:
		token=token.lower()
		token=token.lstrip('0')
		word=stemObj.stemWord(token)
		if word in STOP_WORDS or len(word)<=1:
			continue
Example #39
                if (word_dict[word][page][3]):
                    arr.append('i' + str(word_dict[word][page][3]))
                if (word_dict[word][page][4]):
                    arr.append('r' + str(word_dict[word][page][4]))
                if (word_dict[word][page][5]):
                    arr.append('e' + str(word_dict[word][page][5]))
            line = "".join(arr)
            f.write((line + '\n'))
    f.close()
    f = open("./titles/title_" + str(sys.argv[2]) + ".txt",
             'w',
             encoding="utf-8")
    for title in titles:
        f.write(title + '\n')
    f.close()


start = time.time()
reg1 = re.compile('[A-Za-z0-9]+')
reg2 = re.compile("\[\[Category:(.*)\]\]")
reg3 = re.compile("\[.*?\]")
reg4 = re.compile('\{\{\s*Infobox ((.*?\n)*?) *?\s*\}\}')
ps = Stemmer("porter")
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
Handler = PageHandler()
parser.setContentHandler(Handler)
parser.parse(("./Data/" + sys.argv[1]))
write_to_disk(Handler.word_dict, Handler.titles)
print(time.time() - start)
print(Handler.page_no)
Example #40
"""Text Processing
"""
import re
from collections import defaultdict
from Stemmer import Stemmer

STOP_WORDS = defaultdict(int)
FP = open("stop_words.txt", "r")
for l in FP:
    l = l.strip()
    l = l.lower()
    STOP_WORDS[l] = 1
FP.close()

STEMMER = Stemmer("english")
TAGS = [
    "<sup>", "#REDIRECT", "format=", "dts", "dmy", "colspan", "</sup>",
    "<big>", "</big>", "<small>", "</small>", "</tr>", "<br>", "<br />",
    "<center>", "</center", "</abbr>", "<abbr", "<code>", "</code>", "<div>",
    "</div>", "<imagemap>", "</imagemap>", "<gallery>", "</gallery>"
]

NOT_BODY = [
    "==See also==", "== See also ==", "== References ==",
    "==References and sources==", "==References==", "== Bibliography ==",
    "==External links==", "== External links ==", "{{Infobox", "[[Category"
]


def stem(sentence):
    """Stems the sentence"""
Example #41
File: translate.py Project: weishi/cs124
def lmScoring( sentence ):
	# candidates is the list of candidate sentences formed by trying
	# all possible definitions of all words with >1 translation
	stemmer = Stemmer()
	stemmer.DICT = dict
	candidates = []

	tokens = asTokens( sentence )
	for i in range( len(tokens) ):

		word = tokens[i]

		if word.lower() in dict:

			translations = dict[word.lower()]
			pos = POSTAG[word.lower()]

			# print 'word:',word,', pos:',pos,', dictionary:',translations

			if pos == 'V':
				try:
					stemmer_translations = stemmer.input([word.lower()])
					# print 'stemmer returned: ',stemmer_translations
					if stemmer_translations:
						translations = [stemmer_translations]
				except:
					pass
					# print 'stemmer threw exception on: ', word.lower()



			old_candidates = candidates[:]
			candidates = []
			# print 'old_candidates:', old_candidates

			k = len(translations)
			if k > 1:
				# for idx in range(len(candidates)):
				# 	for t in range(len(translations)):
				if len(old_candidates) == 0:
					for k in range(len(translations)):
						candidates.append( [translations[k]] )
				else:
					for k in range(len(translations)):
						for c in old_candidates:
							# print 'c in old_candidates:',c
							cnew = c + [translations[k]]
							# print cnew
							candidates.append( cnew )
			else:
				# append the current word to all candidate
				# sentences
				if len(old_candidates) == 0:
					candidates.append( [translations[0]] )
				else:
					for c in old_candidates:
						# print 'c in old_candidates:',c
						cnew = c + [translations[0]]
						# print cnew
						candidates.append( cnew )
					# print [c.extend(translations[0]) for c in old_candidates]
					# candidates.extend(  [c.extend(translations[0]) for c in old_candidates] )

		else:

			# print 'CANDIDATES (',len(candidates),')'
			# print candidates
			# print word, "NOT IN DICTIONARY"
			# words not in dictionary pass through untranslated
			translations = [word]

			old_candidates = candidates[:]
			candidates = []

			if len(old_candidates) == 0:
				candidates.append( [translations[0]] )
			else:
				for c in old_candidates:
					cnew = c + [translations[0]]
					candidates.append( cnew )
			# print 'CANDIDATES (',len(candidates),')'
			# print candidates


	neglobprob = [lm.sentenceProbability( ' '.join(cs) ) for cs in candidates ]
	# print neglobprob
	bestSentence = candidates[ neglobprob.index( min(neglobprob) ) ]
	# print 'CANDIDATES (',len(candidates),')'
	# for c in candidates:
	# 	print ' '.join(c)
	# print 'bestSentence='
	# print ' '.join(bestSentence)
	return ' '.join(bestSentence)
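The candidate-growing loops in lmScoring() effectively enumerate the Cartesian product of the per-word translation lists. A compact equivalent of that expansion step (illustrative only; candidate ordering may differ from the loops above):

from itertools import product

def expand_candidates(per_word_translations):
    # [['a'], ['b1', 'b2']] -> [['a', 'b1'], ['a', 'b2']]
    return [list(combo) for combo in product(*per_word_translations)]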
Example #42
0
from Stemmer import Stemmer
import sys
import re, os
import math
from collections import defaultdict
from copy import deepcopy
import subprocess

st = Stemmer('english')
pattern = re.compile(
    r'[\d+\.]*[\d]+|[^\w]+'
)  # matches numbers (real/integer) or runs of non-word characters (underscore counts as a word character)

Summary = []
lamda = 6
alpha = 0.75
#stopword dictionary from "stopwords.txt" file

stopWordDict = defaultdict(int)
stopWordFile = open("./stopwords.txt", "r")
for line in stopWordFile:
    stopWordDict[line.strip()] = 1
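The compiled pattern above is presumably used with re.split() to drop numbers and punctuation while tokenizing; a small illustration on a made-up sentence:

sample = "The model scored 98.6 points, twice."
tokens = [tok for tok in pattern.split(sample.lower()) if tok]
# numbers and non-word runs become separators, so only the alphabetic tokens remain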


def extractDocumentCorpus(folder):
    os.chdir(folder)
    print folder
    document_to_senctence_corpus = {}
    for each_file in os.listdir('.'):
        print each_file
        fileptr = open(each_file, 'r')
Example #43
0
import json
import pickle as pkl
import requests
from requests import utils
import ast

from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
stop_words = set(stopwords.words("english"))

requests.packages.urllib3.disable_warnings()
from nltk.stem import SnowballStemmer, PorterStemmer
from Stemmer import Stemmer
en_stemmer = SnowballStemmer('english')
porter_stemmer = PorterStemmer()
port_pystemmer = Stemmer('porter')
en_pystemmer = Stemmer('english')
stem_words = {}
words_dict = {}
title_dict = {}
total_num_tokens = 0

hindi_places_data = {}
with open('places_dataset_3.json') as f:
    infot = json.load(f)
    mapping = {
        info['hi_wikipedia_title']: ind
        for ind, info in enumerate(infot['data'])
    }
    for info in infot['data']:
        hindi_places_data[info['hi_wikipedia_title']] = info['wd_id']
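Two different PyStemmer algorithms are instantiated above: 'porter' (the original Porter algorithm) and 'english' (the Porter2/Snowball revision). They agree on most words but can disagree on irregular forms; a quick, illustrative comparison:

for w in ("dying", "generously", "running"):
    print(w, port_pystemmer.stemWord(w), en_pystemmer.stemWord(w))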
Example #44
0
File: en.py Project: lelit/sphinx
def __init__(self):
    self.stemmer = PyStemmer("porter")
Example #45
0
from nltk.corpus import stopwords
#from nltk.stem import PorterStemmer
from Stemmer import Stemmer
from string import punctuation
from nltk.tokenize import wordpunct_tokenize
import time
import sys
import errno
import heapq
import shutil

stop_words = set(stopwords.words('english'))

stop_words.update(list(char for char in punctuation))

stemmer = Stemmer('english')

text_punc = list(punc for punc in punctuation
                 if punc not in ['{', '}', '=', '[', ']'])

text_punc.append('\n')

# words_left = ['{', '}', '=', '[', ']' ]


def writing_to_file(Inverted_Index, File_count, file_path):
    path_to_write = os.path.join(file_path, str(File_count) + '.txt')
    #print("File",str(File_count))
    value = list()
    file_pointer = open(path_to_write, 'w+')
    for term in sorted(Inverted_Index):
Example #46
0
def __init__(self):
    self.stoplist = gen_stops()
    self.stemmer = Stemmer('english')
Example #47
0
from __future__ import print_function
import xml.etree.ElementTree as etree
import re, sys, os, heapq, math
from collections import *
from Stemmer import Stemmer as PyStemmer
import glob

reload(sys)
sys.setdefaultencoding('utf-8')
ps = PyStemmer('porter')

if len(sys.argv[1:]) < 2:
    print("needs 2 arguments: the wiki XML dump path and an output directory")
    sys.exit()

pathWikiXML = sys.argv[1].strip()
outputDirPth = sys.argv[2].strip()
if not os.path.exists(outputDirPth):
    os.makedirs(outputDirPth)

absltPthCurrPrgrm = os.path.abspath(os.path.dirname(sys.argv[0]))
# print("existential question ",os.path.exists(outputDirPth))
# print(pathWikiXML)
# file = sys.argv[0]
# pathname = os.path.dirname(file)
##########################################################################

stopwords, allwords = dict(), dict()

prntLst = ['t', 'p', 'c']
dir_names = ["title", "text", "category"]
Example #48
0
File: en.py Project: JelteF/sphinx
def __init__(self):
    # type: () -> None
    self.stemmer = PyStemmer('porter')
Example #49
0
File: en.py Project: avsyap/fitbit
def __init__(self):
    self.stemmer = PyStemmer('porter')
Example #50
0
def __setstate__(self, state):
    self.stemmer = Stemmer('russian')
    self.word_to_idx, self.idx_to_word = state
Example #51
0
import nltk
nltk.download('stopwords')
from Stemmer import Stemmer
from nltk.corpus import stopwords
from collections import OrderedDict, Counter
from pathlib import Path
import os
import bisect
import math
import sys

PageCount = 0
stemmer = Stemmer('porter')
stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

SecondaryIndex = []
WordPageFreq = {}
WordPageId = {}
WordIdf = {}
WordTfIdf = {}
IdTitles = {}
TopKwords = 0

def getTitles():
  f=open("indexfiles/titles.txt","r")
Example #52
0
class StemCorpus(Corpus):
    def __init__(self):
        super().__init__()
        self.stemmer = Stemmer('russian')

    def __getstate__(self):
        return self.word_to_idx, self.idx_to_word

    def __setstate__(self, state):
        self.stemmer = Stemmer('russian')
        self.word_to_idx, self.idx_to_word = state

    def encode_word(self, word):
        stem_form = self.stemmer.stemWord(word.lower())
        return self.word_to_idx.get(stem_form, len(self.idx_to_word) - 1)

    def build(self, sentences, vocabulary_size=50000, log_every=100000):
        print('= Start building vocabulary')
        vocab = defaultdict(int)
        saved_sentences = []
        for i, s in enumerate(sentences, 1):
            line = s.lower().split()
            for tok in line:
                if tok in PUNKT_TAGS:
                    continue
                stem_form = self.stemmer.stemWord(tok.lower())
                vocab[stem_form] += 1
            if i % log_every == 0:
                print('--- Processed {} sentences'.format(i))
            saved_sentences.append(line)

        print('= Built vocabulary with size {}'.format(len(vocab)))
        if vocabulary_size < len(vocab):
            print('= Trim it to {}'.format(vocabulary_size))
        word_freq = list(
            map(itemgetter(0),
                sorted(vocab.items(), key=_freq_sorter, reverse=True)))
        word_freq = word_freq[:vocabulary_size]

        print('Top 10 most frequent words: {}'.format(', '.join(
            word_freq[:10])))
        print('Top 10 least frequent words: {}'.format(', '.join(
            word_freq[-10:])))

        print('= Building word to index mapping')
        if Tag.NUM not in word_freq:
            word_freq[-2] = Tag.NUM

        if Tag.ENG not in word_freq:
            word_freq[-1] = Tag.ENG

        assert Tag.EOS not in word_freq
        word_freq.append(Tag.EOS)

        assert Tag.UNK not in word_freq
        word_freq.append(Tag.UNK)

        self.idx_to_word.clear()
        self.word_to_idx.clear()
        for w in word_freq:
            self.word_to_idx[w] = len(self.idx_to_word)
            self.idx_to_word.append(w)

        print('= Built mappings')
        print('idx_to_word size = {}, word_to_idx size = {}'.format(
            len(self.idx_to_word), len(self.word_to_idx)))
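A hedged usage sketch of the class above; the sample sentences are made up, and Corpus, Tag, PUNKT_TAGS and _freq_sorter are assumed to come from the surrounding project:

corpus = StemCorpus()
corpus.build(["мама мыла раму", "рама стояла у стены"], vocabulary_size=1000)
idx = corpus.encode_word("раму")   # looked up by stem; unknown stems map to the UNK index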
Example #53
0
def stemmer(listofTokens):  #Stemming
    stemmer = Stemmer("english")
    stemmedWords = [stemmer.stemWord(key) for key in listofTokens]
    return stemmedWords
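For reference, a small hypothetical call to the helper above:

print(stemmer(["running", "flies", "easily"]))   # each token reduced to its Snowball 'english' stem

Note that the function builds a fresh Stemmer on every call; hoisting the instance to module level would avoid the repeated construction.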
Example #54
0
def __init__(self):
    super().__init__()
    self.stemmer = Stemmer('russian')
Example #55
0
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
import pandas
from Stemmer import Stemmer

stem = Stemmer('english')
stop_words = ['.', ',']

stop_words = set(stopwords.words('english'))
stop_words.add('.')
stop_words.add('I')

f = open('../../data/processed_data.txt', 'r')
q = open('../../data/queries.txt', 'r')
o = open('../../data/options.txt', 'r')
a = open('../../data/answers.txt', 'r')
tmp = open('tmp.txt', 'a+')  # 'wa+' is not a valid mode; append+read assumed
nouns = []
WINDOW = 20
TEST_SIZE = 40000
data = ""
Example #56
0
from heapq import heappush, heappop
import sys
from math import log10
from pympler.asizeof import asizeof


stop_words = set(stopwords.words('english'))


def stopWords(listOfWords):  #Stop Words Removal
    temp = [key for key in listOfWords if key not in stop_words]
    return temp


ps = Stemmer("english")


def myTokenizer(text):
    words = re.split(r'(\b[^-\s]+\b)((?<=\.\w).)?', text)
    tok = [i for i in words if i is not None and i != " " and i != ""]
    tok = [
        word.lower() for word in tok
        if re.match('^[a-zA-Z0-9\'-.]+$', word) and
        not re.match('^[\',-_]+$', word) and not re.match('^[^\w]+$', word)
    ]
    fin_tok = []
    for t in tok:
        fin_tok.append(re.sub("[\+*=&$@/(),.\-!?:]+", '', t))
    fin_tok = [i for i in fin_tok if i is not None and i != " " and i != ""]
    return fin_tok
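A hedged end-to-end use of the helpers above (the sample sentence is made up):

sample = "Inverted indexes are built from tokenized, stemmed text."
tokens = stopWords(myTokenizer(sample))   # tokenize, lowercase, drop NLTK stop words
stems = ps.stemWords(tokens)              # batch-stem with the 'english' Snowball stemmer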
Example #57
0
def text_cleaner(text):
    text = text.lower()  # convert to lowercase
    stemmer = Stemmer('russian')
    text = ' '.join(stemmer.stemWords(text.split()))
    text = re.sub(r'\b\d+\b', ' digit ', text)  # replace digit runs with ' digit '
    return text
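A brief, hypothetical call (the function needs re and Stemmer imported, which this excerpt omits):

print(text_cleaner("Оплатил 100 рублей за подписку"))
# lowercased, stemmed with the Russian Snowball stemmer, and digit runs replaced by ' digit '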
Example #58
0
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
import pandas
from Stemmer import Stemmer 

stem = Stemmer('english')
stop_words = ['.', ',']

stop_words = set(stopwords.words('english'))
stop_words.add('.')
stop_words.add('I')

f = open('../../data/processed_data.txt', 'r')
q = open('../../data/queries.txt', 'r')
o = open('../../data/options.txt', 'r')
a = open('../../data/answers.txt', 'r')
tmp = open('tmp.txt', 'a+')  # 'wa+' is not a valid mode; append+read assumed
nouns = []
WINDOW = 7
TEST_SIZE = 40000
data = ""
Example #59
0
def __init__(self):
    self.stemmer = PyStemmer('porter')
Example #60
0
TYPES = (
    ('gismu', 'Root words.'),
    ('cmavo', 'Particles.'),
    ('cmavo-compound', 'Particle combinations.'),
    ('lujvo', 'Compound words.'),
    ('experimental gismu', 'Non-standard root words.'),
    ('experimental cmavo', 'Non-standard particles.'),
    ("fu'ivla", 'Loan words.'),
    ('cmene', 'Names.'),
    ('cmevla', 'Names.'),
    ('bu-letteral', 'Letters.'),
    ('zei-lujvo', 'Compound words with ZEI.'),
    ('obsolete cmevla', 'Obsolete names.'),
    ('obsolete cmene', 'Obsolete names.'),
    ('obsolete cmavo', 'Obsolete particles.'),
    ("obsolete fu'ivla", 'Obsolete loan words.'),
    ('obsolete zei-lujvo', 'Obsolete ZEI compound words.'),
)

stem = Stemmer('english').stemWord


def load_yaml(filename):
    with open(filename) as f:
        return yaml.load(f)
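Note that recent PyYAML releases warn when yaml.load() is called without an explicit Loader; if the files contain no Python-specific tags, a safer equivalent is:

def load_yaml(filename):
    with open(filename) as f:
        return yaml.safe_load(f)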


def tex2html(tex):
    """Turn most of the TeX used in jbovlaste into HTML.

    >>> tex2html('$x_1$ is $10^2*2$ examples of $x_{2}$.')
    u'x<sub>1</sub> is 10<sup>2\\xd72</sup> examples of x<sub>2</sub>.'
    >>> tex2html('\emph{This} is emphasised and \\\\textbf{this} is boldfaced.')
    u'<em>This</em> is emphasised and <strong>this</strong> is boldfaced.'
    """