Example #1
def search_for_departments(keywords):
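    # Return "id:name" strings for departments whose name matches every stemmed keyword.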
    keyw = keywords.split()
    dept_ids = []
    try:
        conn = lite.connect(db)
        conn.row_factory = lite.Row
        with conn:
            for k in keyw:
                k_stemmed = stemmer.stem(k)
                print(k_stemmed)
                cur = conn.cursor()
                cur.execute("SELECT id FROM departments where name like ?",
                            ("%" + k_stemmed + "%", ))
                rows = cur.fetchall()
                if len(rows) > 0: print("found in courses.........")
                for row in rows:
                    dept_ids.append(row[0])
            count_keywords = len(keyw)
            common_ids = {x: dept_ids.count(x) for x in dept_ids}
            out = []
            for x in common_ids:
                if common_ids[x] == count_keywords:
                    cur.execute("SELECT name FROM departments where id = ?",
                                (x, ))
                    rows = cur.fetchone()
                    out.append(str(x) + ":" + rows[0])
            return out
    except lite.Error as e:
        print('error opening table departments', e)
        return []
Example #3
    def pre_process(self, uid, tokens):
        # adding into active terms before stemming
        self.active_terms.append((self.timestamp, tokens, uid))

        while len(self.active_terms) > 0:
            term = self.active_terms[0]
            if term[0] < self.timestamp - _ACTIVE_WINDOW_SIZE * 60:
                self.active_terms.popleft()
            else:
                break

        # stemming
        tokens = [stemmer.stem(x) for x in tokens]

        if len(tokens) < 1:
            return None

        # hashing
        results = [] # (counts, reserved_slot, n_words, h)

        for h in range(fast_hashing.HASH_NUMBER):
            results.append(({}, {}, len(tokens), h))

        for token in tokens:
            hash_code = np.array(fast_hashing.hash_code(token)) % _SKETCH_BUCKET_SIZE

            for h in range(fast_hashing.HASH_NUMBER):
                code = hash_code[h]
                if code in results[h][0]:
                    results[h][0][code] += 1
                else:
                    results[h][0][code] = 1

        return results
Example #4
def checkword(word, words, firstword=False, context=''):
    if word in words or stemmer.stem(word) in words:
        return True
    if len(word) > 25:
        return False
    if firstword and (str(word[0]).upper() + word[1:] in words):
        return True
    print("Found a word that wasn't recognized: ", word, ", in the line: ")
    print(re.sub(word, word.upper(), context), end='')
    print("We're looking for close matches to this word. Please wait...")
    dist = 0
    editdists = [10] * 5
    wordlist = {0: '', 1: '', 2: '', 3: '', 4: ''}
    longest = max(editdists)
    index = 1
    for line in words:
        '''Stuff to make it faster!'''
        '''Match upper/lowercase'''
        if not firstword and word[0].isupper() != line[0].isupper():
            continue
        '''Don't allow words that are too long or too short from the dictionary'''
        if len(line) - 3 > len(word) or len(word) - 3 > len(line):
            continue
        '''Randomly check if some letters are contained in both, 
            only for long enough words. This is the part that really gets the time
            down to the order of seconds, rather than minutes.'''
        if len(word) > 3:
            count = 0
            for x in range(int(len(word) / 4)):
                if word[int(random.random() * len(word))] in line:
                    count += 1
            if count < (int(len(word) / 4)) - int(4 / len(word)):
                continue
        '''If we get through all that, then calculate the min edit distance'''
        #ignore case if it's the first word
        if firstword:
            dist = mineditdist(line.lower(), word.lower())
        else:
            dist = mineditdist(line, word)
        '''Saves the word if it's in the top 5'''
        if dist < longest:
            index = editdists.index(longest)
            editdists.remove(longest)
            wordlist[index] = line
            editdists.insert(index, dist)
            longest = max(editdists)
    '''Ordering the list of words that are the closest to the source word'''
    returnlist = []
    while len(returnlist) < 5:
        for val in editdists:
            if len(returnlist) == 5:
                break
            if val == min(editdists):
                returnlist.append(wordlist.get(editdists.index(val)))
                wordlist.pop(editdists.index(val), 0)
                editdists[editdists.index(val)] = 11

    print(returnlist)
    return returnlist
Example #5
  def search(self, query, isPhrase, isOrMatch):
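    # Stem the query terms, run a phrase or term search, and print each result that survives removeNailPolish filtering.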
    results = []

    stemmed = [stem(t) for t in query.split(" ")]
    if (isPhrase):
      results = self.phraseSearch(stemmed)
    else:
      results = self.termSearch(stemmed, isOrMatch)

    for doc in self.removeNailPolish(results):
      self.printResult(doc)
Example #6
def load_text():
    # Read "rating;text" lines from 'opinie1' and map each stemmed, re-joined text to its numeric rating.
    opinions = {}
    with codecs.open('opinie1', 'r', encoding='utf-8') as my_file:
        for line in my_file:
            pair = line.split(";", 2)
            key = pair[1]
            key = split_to_words(key)
            key = stemmer.stem(key)
            key = " ".join(key)
            value = pair[0]
            opinions[key] = float(value)
    return opinions
Example #7
def bagOfWords(s, words):
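    # Build a binary bag-of-words vector over the vocabulary `words` from the stemmed, lowercased tokens of s.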
    bag = [0 for _ in range(len(words))]

    s_words = nltk.word_tokenize(s)
    s_words = [stemmer.stem(word.lower()) for word in s_words]

    for se in s_words:
        for i, w in enumerate(words):
            if w == se:
                bag[i] = 1

    return numpy.array(bag)
Example #8
    def analyse_topics(self, _probs):
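        # Collect words from the active window, look up their probabilities in the hashed sketches,
        # and write the top high-probability words as a JSON event.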
        words = set()
        for term in self.active_terms:
            for word in term[1]:
                words.add(word)
        print "size of words:", len(words)

        high_prob_words = []
        for _word in words:
            word = stemmer.stem(_word)
            hash_code = np.array(fast_hashing.hash_code(word)) % _SKETCH_BUCKET_SIZE
            min_prob_list = []
            for h in range(fast_hashing.HASH_NUMBER):
                prob = _probs[h][hash_code[h]]
                min_prob_list.append(prob)

            min_prob_list.sort()
            min_prob = min_prob_list[1] # !!!
            if min_prob >= _PROBABILITY_THRESHOLD:
                high_prob_words.append((word, min_prob))

        # rescale
        s_prob = sum([p for w, p in high_prob_words])
        high_prob_words = [(w, p/s_prob) for w, p in high_prob_words]

        high_prob_words.sort(key=lambda x: x[1], reverse=True)

        # top 20
        high_prob_words = high_prob_words[:20]

        post_res = postprocessor.process(high_prob_words, self.active_terms)

        if eval(config.get('output', 'debug_info')):
            self.output.write('high_prob_words\n')
            self.output.write(str(high_prob_words)) #debugging
            self.output.write('\npost_res\n')
            self.output.write(str(post_res)) #debugging
            self.output.write('\n')

        flag, word_level_results, _ = post_res
        if flag:
            event = dict()
            event['detection_time'] = str(datetime.utcfromtimestamp(self.timestamp))
            event_words = list()
            for prob_word, word_flag in zip(high_prob_words, word_level_results):
                _word = prob_word[0]
                if word_flag:
                    event_words.append(_word)

            event['key_words'] = event_words

            self.output.write(json.dumps(event))
            self.output.write('\n')
Example #9
    def normalizeText(self, text):
        text = text.lower()
        text = re.sub(r'[^0-9a-zA-Z]+', ' ', text)
        articleWords = text.split()
        articleWords = self.removeStopWords(articleWords)
        stemmedWords = []
        for word in articleWords:
            stemmed = stemmer.stem(word)
            # p = stemmer.PorterStemmer()
            # stemmed = p.stemWord(word)
            self.reverseStemHashtable[stemmed] = word
            stemmedWords.append(stemmed)
        return stemmedWords
Example #10
def index_stem(id, doc):
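    # Lowercase, clean, and stem each term of the document, adding the doc id to that term's posting set in inverted_index.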
    terms = doc.split()

    for term in terms:
        term = term.lower()
        term = clean(term)
        term = stemmer.stem(term, 0, len(term) - 1)

        doc_ids = inverted_index.get(term)
        if doc_ids:
            doc_ids.add(id)
        else:
            inverted_index[term] = set()
            inverted_index[term].add(id)
Example #11
def process_word_cloud(word_cloud):
    latin_numbers = "1234567890IIIVXIVIΙΙΙ"
    symbols = "[]():-!?&,'//"
    #create a list of stopwords
    stop_words = []
    with open('greek_stop_words.txt', 'r') as fo:
        for line in fo:
            w = line.strip()
            if w[-1] == ',': w = w[:-1]
            w = greek_to_upper(w)
            if w not in stop_words: stop_words.append(w)
    for x in symbols:
        word_cloud = word_cloud.replace(x, ' ')
    wc = word_cloud.split(' ')
    wc_up = []
    for w in wc:
        wc_up.append(greek_to_upper(w))

    new_wc = []
    for w in wc_up:
        if w not in stop_words and w not in latin_numbers and len(w) > 2:
            new_wc.append(w)
        else:
            print("stop word eliminated:", w)
    print(len(new_wc))
    print(new_wc)
    stems_list = {}
    for w in new_wc:
        st_w = stemmer.stem(w)
        print(w, st_w)
        if st_w not in stems_list: stems_list[st_w] = [w]
        else: stems_list[st_w].append(w)
    for x in stems_list:
        print(x, stems_list[x])
    word_frequencies = {}
    for x in stems_list:
        # find the shortest member of the corresponding list for the stem
        word_frequencies[min(stems_list[x], key=len)] = len(stems_list[x])
    #for x in word_frequencies: print (x.lower(), word_frequencies[x])
    # new_text =""
    # for x in word_frequencies:
    #     new_text += (" "+x)*word_frequencies[x]
    # print(new_text.lower())
    # return new_text.lower()
    print(word_frequencies)
    #input()
    return word_frequencies
Example #13
    def preprocessing(self, text):
        """ Replace the unusual character in the text """

        to_replace = [
            '!',
            '#',
            '%',
            '$',
            "'",
            '&',
            ')',
            '(',
            '+',
            '*',
            '-',
            ',',
            '/',
            '.',
            '1',
            '0',
            '3',
            '2',
            '5',
            '4',
            '7',
            '6',
            '9',
            '8',
            ';',
            ':',
            '?',
            '_',
            '^',
        ]
        lowered = text.encode('ascii', 'ignore').lower()
        replacing = lowered
        for char_to_replace in to_replace:
            replacing = replacing.replace(char_to_replace,
                                          ' ' + char_to_replace + ' ')
        stemming = ' '
        splited = replacing.split()
        # return replacing
        return stemming.join([stem(item) for item in splited])
Example #14
def main(use_tfidf, opinion_text):
    """Print rating of given text
    """
    opinion = " ".join(stemmer.stem(
        create_vectors.split_to_words(opinion_text.decode('utf-8'))))
    trainset = create_vectors.load_text()
    if use_tfidf:
        keywords_file = 'tfidf_keywords'
    else:
        keywords_file = 'keywords'
    regression_file = 'regr_for_{}'.format(keywords_file)
    with codecs.open(keywords_file, 'r', encoding='utf-8') as kfile:
        keywords = json.load(kfile)
        regr = get_regression_from_file(regression_file)
        rating = get_rating(opinion, regr, trainset, keywords, use_tfidf)
        if rating < 0.0:
            rating = 0.0
        elif rating > 5.0:
            rating = 5.0
        print '{:.2f}'.format(rating)
Example #15
def getData(company, amount, datef, datet):
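    # Download and persist news and stock data for the company, stem the news, and write date-aligned connections to CSV.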

    news_dates, news, news_count = downloadNews(company, amount)
    writeNews(news_dates, news, news_count,
              path + 'news' + sep + '{}.csv'.format(company))
    #news_dates, news, news_count = readNews(path + 'news' + sep + '{}.csv'.format(company))

    stocks_dates, stocks, stocks_count = downloadStock(company, datef, datet)
    writeStock(stocks_dates, stocks, stocks_count,
               path + 'stocks' + sep + '{}.csv'.format(company))
    #stocks_dates, stocks, stocks_count = readStock(path + 'stocks' + sep + '{}.csv'.format(company))

    stems_dates, stems, stems_count = stem(news_dates, news, news_count)
    writeNews(stems_dates, stems, stems_count,
              path + 'stems' + sep + '{}.csv'.format(company))
    #stems_dates, stems, stems_count = readNews(path + 'stems' + sep + '{}.csv'.format(company))

    connections_dates, connections_news, connections_stocks, connections_count = connect(
        stems_dates, stems, stems_count, stocks_dates, stocks, stocks_count)
    writeConnections(connections_dates, connections_news, connections_stocks,
                     connections_count,
                     path + 'connections' + sep + '{}.csv'.format(company))
Example #16
    def process(self, _ptweet):
        self.timestamp = _ptweet.timestamp
        _tokens = _ptweet.tokens
        tokens = [stemmer.stem(x) for x in _tokens]
        if len(tokens) < 3:
            return None, None
        unique_words = set(tokens)
        unique_word_pairs = set()

        for i in unique_words:
            for j in unique_words - {i}:
                # To us [a, b] = [b, a], and sorting gives us a distinct representation.
                unique_word_pairs.add(tuple(sorted([i, j])))

        max_sig = 0
        max_sig_instance = None
        sig_list = list()

        for token in unique_word_pairs:
            if _SIGNI_TYPE == 's':
                min_instance = []
                scores, codes = self.sig_scorers.get(token, self.timestamp)
                for x in scores:
                    min_instance.append(x.observe(int(self.timestamp), 1.0))
                count, ewma, ewmavar, sig = min(min_instance,
                                                key=lambda x: x[1])
                #                 count, ewma, ewmavar, sig = min([x.observe(int(self.timestamp), 1.0) for x in self.sig_scorers.get(token, self.timestamp)],key=lambda x:x[1])
                if sig > max_sig and ewma > 0:
                    max_sig = sig
                    max_sig_instance = _ptweet.datetime(
                    ), count, ewma, ewmavar, sig, token
                if sig > _SIGNI_THRESHOLD and ewma > 0:
                    sig_list.append(
                        (_ptweet.datetime(), count, ewma, ewmavar, sig, token))
        if max_sig > _SIGNI_THRESHOLD:
            #             print(max_sig_instance)
            return max_sig_instance, sig_list
        return None, None
Example #17
def search_stem(tokens):
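    # Evaluate the stemmed query tokens against inverted_index, combining posting sets with the
    # operator selected by each token's two-character prefix (OR by default).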
    prev_doc_ids = set()
    accumulate = or_comp

    for token in tokens:
        token = token.lower()
        token = stemmer.stem(token, 0, len(token) - 1)

        if operators.get(token[0:2]):
            accumulate = operators[token[0:2]]
            #print 'operators', accumulate
            token = token[2:]

        doc_ids = inverted_index.get(token)
        #print token, '=', doc_ids
        if doc_ids:
            doc_ids = accumulate(doc_ids, prev_doc_ids)
            #print accumulate, '=', doc_ids
            prev_doc_ids = set(doc_ids)

    l = list(doc_ids)
    l.sort()
    print '\t', tokens, '-->', l
Example #18
    def analyse_topics(self, _probs):
        words = set()
        for term in self.active_terms:
            for word in term[1]:
                words.add(word)
        print "size of words:", len(words)

        high_prob_words = []
        for _word in words:
            word = stemmer.stem(_word)
            hash_code = np.array(fast_hashing.hash_code(word)) % _SKETCH_BUCKET_SIZE
            min_prob_list = []
            for h in range(fast_hashing.HASH_NUMBER):
                prob = _probs[h][hash_code[h]]
                min_prob_list.append(prob)

            min_prob_list.sort()
            min_prob = min_prob_list[1] # !!!
            if min_prob >= _PROBABILITY_THRESHOLD:
                high_prob_words.append((word, min_prob, hash_code))

        high_prob_words.sort(key=lambda x: x[1], reverse=True)
        high_prob_words = high_prob_words[:_MAX_NUMBER_WORDS]

        print high_prob_words

        _kws = list()
        _kps = list()

        post_result = postprocessor.process(high_prob_words, self.active_terms)

        print post_result

        if not post_result[0]:
            return

        _event = dict()
        _id = event_output.getId()
        _event['eid'] = _id
        _event['topicID'] = _id

        _event['info.dtime'] = str(datetime.datetime.utcfromtimestamp(self.timestamp))

        '''
        for high_prob_word in high_prob_words:
            _kws.append(high_prob_word[0])
            _kps.append(high_prob_word[1])'''
        word_level_result = post_result[1]
        for i in range(len(high_prob_words)):
            high_prob_word = high_prob_words[i]
            if word_level_result[i]:
                _kws.append(high_prob_word[0])
                _kps.append(high_prob_word[1])


        _event['info.keywords'] = _kws
        _event['info.probs'] = _kps

        _event['info.numUsers'] = post_result[3]
        _event['info.numGeoUsers'] = 0
        _event['info.numTweets'] = post_result[2]
        _event['info.numGeoTweets'] = 0

        event_output.put(_id, _event)
Example #19
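# Read the input text file named on the command line, strip non-word characters, stem every word,
# and tally word and stem frequencies.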
if len(sys.argv) < 2:
    print("Invalid argument count")
    sys.exit()
else:
    if len(sys.argv) >= 3:
        bgColor = sys.argv[2]
    if len(sys.argv) == 4:
        maskFile = "py/masks/" + sys.argv[3] + ".jpg"
    outFile = sys.argv[1]
inp = open("py/text/" + outFile + ".txt", encoding="utf-8", mode="r")
text = inp.read()
inp.close()
text = re.sub('[\W_]+', ' ', text)
splits = [x for x in text.split(' ') if (not x.isspace() and x)]
stemsplits = stemmer.stem(" ".join(splits)).split(' ')
dicta = {}
dicts = defaultdict(int)
dictst = defaultdict(int)
dictfull = {}
for x in range(0, len(splits)):
    dicta[splits[x]] = stemsplits[x]
for x in range(0, len(splits)):
    dicts[splits[x]] += 1
for x in range(0, len(stemsplits)):
    dictst[stemsplits[x]] += 1
sorted_d = sorted(dicts.items(), key=operator.itemgetter(1))
brr = len(splits)
for w in range(0, len(sorted_d)):
    if dicta[sorted_d[w][0]] not in dictfull:
        dictfull[dicta[sorted_d[w][0]]] = (sorted_d[w][0],
Example #20
            "knew": "knew",
            "knick": "knick",
            "knif": "knif",
            "knife": "knife",
            "knight": "knight",
            "knightly": "knight",
            "knights": "knight",
            "knit": "knit",
            "knits": "knit",
            "knitted": "knit",
            "knitting": "knit",
            "knives": "knive",
            "knob": "knob",
            "knobs": "knob",
            "knock": "knock",
            "knocked": "knock",
            "knocker": "knocker",
            "knockers": "knocker",
            "knocking": "knock",
            "knocks": "knock",
            "knopp": "knopp",
            "knot": "knot",
            "knots": "knot",
}

for original in test_cases:
    stemmed_term = stem(original)
    expected_stem = test_cases[original]
    error_msg = "stemmed %s to %s, expected value %s" % (original, stemmed_term, expected_stem)
    assert expected_stem == stemmed_term,  error_msg
Example #22
import sys

from triples import ParseTriples, Triple
import keyvalue.sqliteKVStore as sqliteKVS
import stemmer as s

imagesStore = sqliteKVS.SqliteKeyValue("images.db")
labelsStore = sqliteKVS.SqliteKeyValue("labels.db")

if (len(sys.argv) < 2):
    print("Es necesario indicar la o las palabras a buscar Ejemplo:")
    print("{0} palabra1".format(sys.argv[0]))

for word in sys.argv[1:]:
    w = s.stem(word)

    newword = labelsStore.getItem(w)

    print(newword)
    #if len(word) > 0:
    #    print(imagesStore.getItem(word[0][0]))
    #@TODO Here we should implement the logic of looking up the URLs
    #associated with each word given to us on the command line.

imagesStore.close()
labelsStore.close()
Example #23
            dictionary[image.getSubject()] = image.getObject()

        print(image.getSubject() + " -- " + image.getObject())

    image = imagesDS.getNext()

for key, value in dictionary.items():
    h = 0
    imagsDyna.putItem(key, {"S": value})

for i in range(0, 5000):

    label = labelsDS.getNext()

    #if len(labelsDyna.getItem(label.getSubject())) > 0:
    stemmer = s.stem(label.getObject())
    #Note that label could have multiple values so iterate through the list
    for word in stemmer.split(" "):
        if label.getSubject() in dictionary:
            if word in dictionaryLabels:
                dictionaryLabels[word].append({"S": label.getSubject()})
            else:
                dictionaryLabels[word] = [{"S": label.getSubject()}]
            print(word + " is asociated with " + label.getObject() + " " +
                  label.getSubject())
        #termsStore.putItem(key=word, value=label.getSubject())

for key, value in dictionaryLabels.items():
    h = 0
    labelsDyna.putItem(key, {"L": value})
Example #24
def stemmed_words(doc):
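    # Yield the stem of each token the analyzer produces for the document.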
    return (mystem.stem(w) for w in analyzer(doc))
Example #25
def correctSentence(sentence, index):
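    # Stem the POS-tagged word at the given index, collect real words built from a verified root,
    # and pick a replacement using the surrounding bigram context.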
    taggedS = tagger.applyMLTag(sentence)
    word = taggedS[index][0]
    POStag = taggedS[index][1]
    stemList = stemmer.stem(word, POStag)
    #remove duplicates
    verifiedWords = []
    for s in stemList:
        #("found stem "+str(s))
        tags = tagger.getTagsForWord(s[0])
        if len(tags) > 0:
            #print("stem added")
            verifiedWords += [s]
#at this point, verifiedWords should contain only real words
    if len(verifiedWords) == 0:
        #print("No verified words.")
        return "No Answer"

    replacementWord = ""
    #print "Entering while loop"
    while (replacementWord == "" and len(verifiedWords) > 0):
        #find the shortest word/root
        root = verifiedWords[0]
        numVerifiedLeft = len(verifiedWords)
        for w in verifiedWords:
            if len(w[0]) <= len(root[0]):
                root = w
#print("shortest word is "+str(root))
#possibles should contain all words that can contain the root
            possibles = tagger.getWordsWithRoot(root[0])
            if (root[0][-1] == 'e'):
                possibles += tagger.getWordsWithRoot(root[0][:-1])
            elif (root[0][-1] == 'y'):
                possibles += tagger.getWordsWithRoot(root[0][:-1] + 'i')
            for row in stemmer.csvReader("irregularPastVerbs.csv"):
                if (row[0] == root[0]):
                    possibles += tagger.getWordsWithRoot(row[1])
                    possibles += tagger.getWordsWithRoot(row[2])
#print("possibles for "+str(root)+" are "+str(possibles))
#actualPossibles should contain all words that can be stemmed to the root
            possibles.sort(key=lambda x: len(x[0]), reverse=False)
            possibles = possibles[:40]
            actualPossibles = []
            for word in possibles:
                if (stemmer.isRootOfWord(root[0], root[1], word[0], word[1])):
                    actualPossibles += [word]
            print("actual possibles for " + str(root) + " are " +
                  str(actualPossibles))
            prevWord = ""
            if index > 0:
                prevWord = sentence[index - 1]
            nextWord = ""
            if index < len(sentence) - 1:
                nextWord = sentence[index + 1]
            replacementWord = MLWordUsingBigrams(prevWord, nextWord,
                                                 actualPossibles)
            #print("replacement word found for root "+str(root)+" is "+replacementWord)
            #						verifiedWords.remove(root)
            numVerifiedLeft -= 1
        if (numVerifiedLeft == 0 and replacementWord == ""):
            #print("No good replacements found. Cry now.")
            return "No Answer"

#print("We highly reccomend that you replace your word with "+replacementWord)
#print("Your sentence would then become:")
        sentence[index] = replacementWord
        newSentence = ""
        for w in sentence:
            newSentence += (w + " ")
        print newSentence
        return sentence
Example #26
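# Split the date-aligned news/stock data 75/25 into training and test sets, then either fit the
# model or predict from freshly stemmed news.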
border = int(connections_count * 0.75)

training_dates = total_dates[:border]
training_news = total_news_sequence[:border]
training_stocks = total_stocks[:border]
training_count = border

testing_dates = total_dates[border:]
testing_news = total_news_sequence[border:]
testing_stocks = total_stocks[border:]
testing_count = total_count - border

total_X = numpy.array(total_news_sequence)
total_y = numpy.array(total_stocks)

training_X = numpy.array(training_news)
training_y = numpy.array(training_stocks)

testing_X = numpy.array(testing_news)
testing_y = numpy.array(testing_stocks)

if sys.argv[1] == '-f':
    fit(company, training_X, training_y, testing_X, testing_y)
else:
    news_dates, news, news_count = readNews(path + predict_path)
    stems_dates, stems, stems_count = stem(news_dates, news, news_count)
    news_sequences = sequence.pad_sequences(
        sequences=tokenizer.texts_to_sequences(stems))
    y = predict(news_sequences, company)
    print(y)