Example #1
def search_for_departments(keywords):
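    # Return "id:name" strings for departments whose name matches every stemmed keyword.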
    keyw = keywords.split()
    dept_ids = []
    try:
        conn = lite.connect(db)
        conn.row_factory = lite.Row
        with conn:
            for k in keyw:
                k_stemmed = stemmer.stem(k)
                print(k_stemmed)
                cur = conn.cursor()
                cur.execute("SELECT id FROM departments where name like ?",
                            ("%" + k_stemmed + "%", ))
                rows = cur.fetchall()
                if len(rows) > 0: print("found in courses.........")
                for row in rows:
                    dept_ids.append(row[0])
            count_keywords = len(keyw)
            common_ids = {x: dept_ids.count(x) for x in dept_ids}
            out = []
            for x in common_ids:
                if common_ids[x] == count_keywords:
                    cur.execute("SELECT name FROM departments where id = ?",
                                (x, ))
                    rows = cur.fetchone()
                    out.append(str(x) + ":" + rows[0])
            return out
    except lite.Error as e:
        print('error opening table departments', e)
        return []
Example #3
    def pre_process(self, uid, tokens):
        # adding into active terms before stemming
        self.active_terms.append((self.timestamp, tokens, uid))

        while len(self.active_terms) > 0:
            term = self.active_terms[0]
            if term[0] < self.timestamp - _ACTIVE_WINDOW_SIZE * 60:
                self.active_terms.popleft()
            else:
                break

        # stemming
        tokens = [stemmer.stem(x) for x in tokens]

        if len(tokens) < 1:
            return None

        # hashing
        results = [] # (counts, reserved_slot, n_words, h)

        for h in range(fast_hashing.HASH_NUMBER):
            results.append(({}, {}, len(tokens), h))

        for token in tokens:
            hash_code = np.array(fast_hashing.hash_code(token)) % _SKETCH_BUCKET_SIZE

            for h in range(fast_hashing.HASH_NUMBER):
                code = hash_code[h]
                if code in results[h][0]:
                    results[h][0][code] += 1
                else:
                    results[h][0][code] = 1

        return results
Example #4
def checkword(word, words, firstword=False, context=''):
    if word in words or stemmer.stem(word) in words:
        return True
    if len(word) > 25:
        return False
    if firstword and (str(word[0]).upper() + word[1:] in words):
        return True
    print("Found a word that wasn't recognized: ", word, ", in the line: ")
    print(re.sub(word, word.upper(), context), end='')
    print("We're looking for close matches to this word. Please wait...")
    dist = 0
    editdists = [10] * 5
    wordlist = {0: '', 1: '', 2: '', 3: '', 4: ''}
    longest = max(editdists)
    index = 1
    for line in words:
        '''Stuff to make it faster!'''
        '''Match upper/lowercase'''
        if not firstword and word[0].isupper() != line[0].isupper():
            continue
        '''Don't allow words that are too long or too short from the dictionary'''
        if len(line) - 3 > len(word) or len(word) - 3 > len(line):
            continue
        '''Randomly check if some letters are contained in both, 
            only for long enough words. This is the part that really gets the time
            down to the order of seconds, rather than minutes.'''
        if len(word) > 3:
            count = 0
            for x in range(int(len(word) / 4)):
                if word[int(random.random() * len(word))] in line:
                    count += 1
            if count < (int(len(word) / 4)) - int(4 / len(word)):
                continue
        '''If we get through all that, then calculate the min edit distance'''
        #ignore case if it's the first word
        if firstword:
            dist = mineditdist(line.lower(), word.lower())
        else:
            dist = mineditdist(line, word)
        '''Saves the word if it's in the top 5'''
        if dist < longest:
            index = editdists.index(longest)
            editdists.remove(longest)
            wordlist[index] = line
            editdists.insert(index, dist)
            longest = max(editdists)
    '''Ordering the list of words that are the closest to the source word'''
    returnlist = []
    while len(returnlist) < 5:
        for val in editdists:
            if len(returnlist) == 5:
                break
            if val == min(editdists):
                returnlist.append(wordlist.get(editdists.index(val)))
                wordlist.pop(editdists.index(val), 0)
                editdists[editdists.index(val)] = 11

    print(returnlist)
    return returnlist
Example #5
  def search(self, query, isPhrase, isOrMatch):
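    # Stem the query terms, run a phrase or term search, and print each result that survives removeNailPolish filtering.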
    results = []

    stemmed = [stem(t) for t in query.split(" ")]
    if (isPhrase):
      results = self.phraseSearch(stemmed)
    else:
      results = self.termSearch(stemmed, isOrMatch)

    for doc in self.removeNailPolish(results):
      self.printResult(doc)
Example #6
def load_text():
    # Read "rating;text" lines from 'opinie1' and map each stemmed, re-joined text to its numeric rating.
    opinions = {}
    with codecs.open('opinie1', 'r', encoding='utf-8') as my_file:
        for line in my_file:
            pair = line.split(";", 2)
            key = pair[1]
            key = split_to_words(key)
            key = stemmer.stem(key)
            key = " ".join(key)
            value = pair[0]
            opinions[key] = float(value)
    return opinions
Example #7
def bagOfWords(s, words):
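    # Build a binary bag-of-words vector over the vocabulary `words` from the stemmed, lowercased tokens of s.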
    bag = [0 for _ in range(len(words))]

    s_words = nltk.word_tokenize(s)
    s_words = [stemmer.stem(word.lower()) for word in s_words]

    for se in s_words:
        for i, w in enumerate(words):
            if w == se:
                bag[i] = 1

    return numpy.array(bag)
Example #8
    def analyse_topics(self, _probs):
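        # Collect words from the active window, look up their probabilities in the hashed sketches,
        # and write the top high-probability words as a JSON event.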
        words = set()
        for term in self.active_terms:
            for word in term[1]:
                words.add(word)
        print "size of words:", len(words)

        high_prob_words = []
        for _word in words:
            word = stemmer.stem(_word)
            hash_code = np.array(fast_hashing.hash_code(word)) % _SKETCH_BUCKET_SIZE
            min_prob_list = []
            for h in range(fast_hashing.HASH_NUMBER):
                prob = _probs[h][hash_code[h]]
                min_prob_list.append(prob)

            min_prob_list.sort()
            min_prob = min_prob_list[1] # !!!
            if min_prob >= _PROBABILITY_THRESHOLD:
                high_prob_words.append((word, min_prob))

        # rescale
        s_prob = sum([p for w, p in high_prob_words])
        high_prob_words = [(w, p/s_prob) for w, p in high_prob_words]

        high_prob_words.sort(key=lambda x: x[1], reverse=True)

        # top 20
        high_prob_words = high_prob_words[:20]

        post_res = postprocessor.process(high_prob_words, self.active_terms)

        if eval(config.get('output', 'debug_info')):
            self.output.write('high_prob_words\n')
            self.output.write(str(high_prob_words)) #debugging
            self.output.write('\npost_res\n')
            self.output.write(str(post_res)) #debugging
            self.output.write('\n')

        flag, word_level_results, _ = post_res
        if flag:
            event = dict()
            event['detection_time'] = str(datetime.utcfromtimestamp(self.timestamp))
            event_words = list()
            for prob_word, word_flag in zip(high_prob_words, word_level_results):
                _word = prob_word[0]
                if word_flag:
                    event_words.append(_word)

            event['key_words'] = event_words

            self.output.write(json.dumps(event))
            self.output.write('\n')
Example #9
    def normalizeText(self, text):
        text = text.lower()
        text = re.sub(r'[^0-9a-zA-Z]+', ' ', text)
        articleWords = text.split()
        articleWords = self.removeStopWords(articleWords)
        stemmedWords = []
        for word in articleWords:
            stemmed = stemmer.stem(word)
            # p = stemmer.PorterStemmer()
            # stemmed = p.stemWord(word)
            self.reverseStemHashtable[stemmed] = word
            stemmedWords.append(stemmed)
        return stemmedWords
Example #10
def index_stem(id, doc):
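    # Lowercase, clean, and stem each term of the document, adding the doc id to that term's posting set in inverted_index.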
    terms = doc.split()

    for term in terms:
        term = term.lower()
        term = clean(term)
        term = stemmer.stem(term, 0, len(term) - 1)

        doc_ids = inverted_index.get(term)
        if doc_ids:
            doc_ids.add(id)
        else:
            inverted_index[term] = set()
            inverted_index[term].add(id)
Example #11
def process_word_cloud(word_cloud):
    latin_numbers = "1234567890IIIVXIVIΙΙΙ"
    symbols = "[]():-!?&,'//"
    #create a list of stopwords
    stop_words = []
    with open('greek_stop_words.txt', 'r') as fo:
        for line in fo:
            w = line.strip()
            if w[-1] == ',': w = w[:-1]
            w = greek_to_upper(w)
            if w not in stop_words: stop_words.append(w)
    for x in symbols:
        word_cloud = word_cloud.replace(x, ' ')
    wc = word_cloud.split(' ')
    wc_up = []
    for w in wc:
        wc_up.append(greek_to_upper(w))

    new_wc = []
    for w in wc_up:
        if w not in stop_words and w not in latin_numbers and len(w) > 2:
            new_wc.append(w)
        else:
            print("stop word eliminated:", w)
    print(len(new_wc))
    print(new_wc)
    stems_list = {}
    for w in new_wc:
        st_w = stemmer.stem(w)
        print(w, st_w)
        if st_w not in stems_list: stems_list[st_w] = [w]
        else: stems_list[st_w].append(w)
    for x in stems_list:
        print(x, stems_list[x])
    word_frequencies = {}
    for x in stems_list:
        # find the shortest member of the corresponding list for the stem
        word_frequencies[min(stems_list[x], key=len)] = len(stems_list[x])
    #for x in word_frequencies: print (x.lower(), word_frequencies[x])
    # new_text =""
    # for x in word_frequencies:
    #     new_text += (" "+x)*word_frequencies[x]
    # print(new_text.lower())
    # return new_text.lower()
    print(word_frequencies)
    #input()
    return word_frequencies
Example #13
    def preprocessing(self, text):
        """ Replace the unusual character in the text """

        to_replace = [
            '!',
            '#',
            '%',
            '$',
            "'",
            '&',
            ')',
            '(',
            '+',
            '*',
            '-',
            ',',
            '/',
            '.',
            '1',
            '0',
            '3',
            '2',
            '5',
            '4',
            '7',
            '6',
            '9',
            '8',
            ';',
            ':',
            '?',
            '_',
            '^',
        ]
        lowered = text.encode('ascii', 'ignore').lower()
        replacing = lowered
        for char_to_replace in to_replace:
            replacing = replacing.replace(char_to_replace,
                                          ' ' + char_to_replace + ' ')
        stemming = ' '
        splited = replacing.split()
        # return replacing
        return stemming.join([stem(item) for item in splited])
Example #14
def main(use_tfidf, opinion_text):
    """Print rating of given text
    """
    opinion = " ".join(stemmer.stem(
        create_vectors.split_to_words(opinion_text.decode('utf-8'))))
    trainset = create_vectors.load_text()
    if use_tfidf:
        keywords_file = 'tfidf_keywords'
    else:
        keywords_file = 'keywords'
    regression_file = 'regr_for_{}'.format(keywords_file)
    with codecs.open(keywords_file, 'r', encoding='utf-8') as kfile:
        keywords = json.load(kfile)
        regr = get_regression_from_file(regression_file)
        rating = get_rating(opinion, regr, trainset, keywords, use_tfidf)
        if rating < 0.0:
            rating = 0.0
        elif rating > 5.0:
            rating = 5.0
        print '{:.2f}'.format(rating)
Example #15
def getData(company, amount, datef, datet):
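    # Download and persist news and stock data for the company, stem the news, and write date-aligned connections to CSV.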

    news_dates, news, news_count = downloadNews(company, amount)
    writeNews(news_dates, news, news_count,
              path + 'news' + sep + '{}.csv'.format(company))
    #news_dates, news, news_count = readNews(path + 'news' + sep + '{}.csv'.format(company))

    stocks_dates, stocks, stocks_count = downloadStock(company, datef, datet)
    writeStock(stocks_dates, stocks, stocks_count,
               path + 'stocks' + sep + '{}.csv'.format(company))
    #stocks_dates, stocks, stocks_count = readStock(path + 'stocks' + sep + '{}.csv'.format(company))

    stems_dates, stems, stems_count = stem(news_dates, news, news_count)
    writeNews(stems_dates, stems, stems_count,
              path + 'stems' + sep + '{}.csv'.format(company))
    #stems_dates, stems, stems_count = readNews(path + 'stems' + sep + '{}.csv'.format(company))

    connections_dates, connections_news, connections_stocks, connections_count = connect(
        stems_dates, stems, stems_count, stocks_dates, stocks, stocks_count)
    writeConnections(connections_dates, connections_news, connections_stocks,
                     connections_count,
                     path + 'connections' + sep + '{}.csv'.format(company))
Example #16
    def process(self, _ptweet):
        self.timestamp = _ptweet.timestamp
        _tokens = _ptweet.tokens
        tokens = [stemmer.stem(x) for x in _tokens]
        if len(tokens) < 3:
            return None, None
        unique_words = set(tokens)
        unique_word_pairs = set()

        for i in unique_words:
            for j in unique_words - {i}:
                # To us [a, b] = [b, a], and sorting gives us a distinct representation.
                unique_word_pairs.add(tuple(sorted([i, j])))

        max_sig = 0
        max_sig_instance = None
        sig_list = list()

        for token in unique_word_pairs:
            if _SIGNI_TYPE == 's':
                min_instance = []
                scores, codes = self.sig_scorers.get(token, self.timestamp)
                for x in scores:
                    min_instance.append(x.observe(int(self.timestamp), 1.0))
                count, ewma, ewmavar, sig = min(min_instance,
                                                key=lambda x: x[1])
                #                 count, ewma, ewmavar, sig = min([x.observe(int(self.timestamp), 1.0) for x in self.sig_scorers.get(token, self.timestamp)],key=lambda x:x[1])
                if sig > max_sig and ewma > 0:
                    max_sig = sig
                    max_sig_instance = _ptweet.datetime(
                    ), count, ewma, ewmavar, sig, token
                if sig > _SIGNI_THRESHOLD and ewma > 0:
                    sig_list.append(
                        (_ptweet.datetime(), count, ewma, ewmavar, sig, token))
        if max_sig > _SIGNI_THRESHOLD:
            #             print(max_sig_instance)
            return max_sig_instance, sig_list
        return None, None
Example #17
def search_stem(tokens):
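    # Evaluate the stemmed query tokens against inverted_index, combining posting sets with the
    # operator selected by each token's two-character prefix (OR by default).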
    prev_doc_ids = set()
    accumulate = or_comp

    for token in tokens:
        token = token.lower()
        token = stemmer.stem(token, 0, len(token) - 1)

        if operators.get(token[0:2]):
            accumulate = operators[token[0:2]]
            #print 'operators', accumulate
            token = token[2:]

        doc_ids = inverted_index.get(token)
        #print token, '=', doc_ids
        if doc_ids:
            doc_ids = accumulate(doc_ids, prev_doc_ids)
            #print accumulate, '=', doc_ids
            prev_doc_ids = set(doc_ids)

    l = list(doc_ids)
    l.sort()
    print '\t', tokens, '-->', l
Example #18
    def analyse_topics(self, _probs):
        words = set()
        for term in self.active_terms:
            for word in term[1]:
                words.add(word)
        print "size of words:", len(words)

        high_prob_words = []
        for _word in words:
            word = stemmer.stem(_word)
            hash_code = np.array(fast_hashing.hash_code(word)) % _SKETCH_BUCKET_SIZE
            min_prob_list = []
            for h in range(fast_hashing.HASH_NUMBER):
                prob = _probs[h][hash_code[h]]
                min_prob_list.append(prob)

            min_prob_list.sort()
            min_prob = min_prob_list[1] # !!!
            if min_prob >= _PROBABILITY_THRESHOLD:
                high_prob_words.append((word, min_prob, hash_code))

        high_prob_words.sort(key=lambda x: x[1], reverse=True)
        high_prob_words = high_prob_words[:_MAX_NUMBER_WORDS]

        print high_prob_words

        _kws = list()
        _kps = list()

        post_result = postprocessor.process(high_prob_words, self.active_terms)

        print post_result

        if not post_result[0]:
            return

        _event = dict()
        _id = event_output.getId()
        _event['eid'] = _id
        _event['topicID'] = _id

        _event['info.dtime'] = str(datetime.datetime.utcfromtimestamp(self.timestamp))

        '''
        for high_prob_word in high_prob_words:
            _kws.append(high_prob_word[0])
            _kps.append(high_prob_word[1])'''
        word_level_result = post_result[1]
        for i in range(len(high_prob_words)):
            high_prob_word = high_prob_words[i]
            if word_level_result[i]:
                _kws.append(high_prob_word[0])
                _kps.append(high_prob_word[1])


        _event['info.keywords'] = _kws
        _event['info.probs'] = _kps

        _event['info.numUsers'] = post_result[3]
        _event['info.numGeoUsers'] = 0
        _event['info.numTweets'] = post_result[2]
        _event['info.numGeoTweets'] = 0

        event_output.put(_id, _event)
Example #19
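# Read the input text file named on the command line, strip non-word characters, stem every word,
# and tally word and stem frequencies.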
if len(sys.argv) < 2:
    print("Invalid argument count")
    sys.exit()
else:
    if len(sys.argv) >= 3:
        bgColor = sys.argv[2]
    if len(sys.argv) == 4:
        maskFile = "py/masks/" + sys.argv[3] + ".jpg"
    outFile = sys.argv[1]
inp = open("py/text/" + outFile + ".txt", encoding="utf-8", mode="r")
text = inp.read()
inp.close()
text = re.sub('[\W_]+', ' ', text)
splits = [x for x in text.split(' ') if (not x.isspace() and x)]
stemsplits = stemmer.stem(" ".join(splits)).split(' ')
dicta = {}
dicts = defaultdict(int)
dictst = defaultdict(int)
dictfull = {}
for x in range(0, len(splits)):
    dicta[splits[x]] = stemsplits[x]
for x in range(0, len(splits)):
    dicts[splits[x]] += 1
for x in range(0, len(stemsplits)):
    dictst[stemsplits[x]] += 1
sorted_d = sorted(dicts.items(), key=operator.itemgetter(1))
brr = len(splits)
for w in range(0, len(sorted_d)):
    if dicta[sorted_d[w][0]] not in dictfull:
        dictfull[dicta[sorted_d[w][0]]] = (sorted_d[w][0],
Example #20
            "knew": "knew",
            "knick": "knick",
            "knif": "knif",
            "knife": "knife",
            "knight": "knight",
            "knightly": "knight",
            "knights": "knight",
            "knit": "knit",
            "knits": "knit",
            "knitted": "knit",
            "knitting": "knit",
            "knives": "knive",
            "knob": "knob",
            "knobs": "knob",
            "knock": "knock",
            "knocked": "knock",
            "knocker": "knocker",
            "knockers": "knocker",
            "knocking": "knock",
            "knocks": "knock",
            "knopp": "knopp",
            "knot": "knot",
            "knots": "knot",
}

for original in test_cases:
    stemmed_term = stem(original)
    expected_stem = test_cases[original]
    error_msg = "stemmed %s to %s, expected value %s" % (original, stemmed_term, expected_stem)
    assert expected_stem == stemmed_term,  error_msg
Example #22
import sys

from triples import ParseTriples, Triple
import keyvalue.sqliteKVStore as sqliteKVS
import stemmer as s

imagesStore = sqliteKVS.SqliteKeyValue("images.db")
labelsStore = sqliteKVS.SqliteKeyValue("labels.db")

if (len(sys.argv) < 2):
    print("Es necesario indicar la o las palabras a buscar Ejemplo:")
    print("{0} palabra1".format(sys.argv[0]))

for word in sys.argv[1:]:
    w = s.stem(word)

    newword = labelsStore.getItem(w)

    print(newword)
    #if len(word) > 0:
    #    print(imagesStore.getItem(word[0][0]))
    #@TODO Here we should implement the logic of looking up the URLs
    #associated with each word given to us on the command line.

imagesStore.close()
labelsStore.close()
Example #23
            dictionary[image.getSubject()] = image.getObject()

        print(image.getSubject() + " -- " + image.getObject())

    image = imagesDS.getNext()

for key, value in dictionary.items():
    h = 0
    imagsDyna.putItem(key, {"S": value})

for i in range(0, 5000):

    label = labelsDS.getNext()

    #if len(labelsDyna.getItem(label.getSubject())) > 0:
    stemmer = s.stem(label.getObject())
    #Note that label could have multiple values so iterate through the list
    for word in stemmer.split(" "):
        if label.getSubject() in dictionary:
            if word in dictionaryLabels:
                dictionaryLabels[word].append({"S": label.getSubject()})
            else:
                dictionaryLabels[word] = [{"S": label.getSubject()}]
            print(word + " is asociated with " + label.getObject() + " " +
                  label.getSubject())
        #termsStore.putItem(key=word, value=label.getSubject())

for key, value in dictionaryLabels.items():
    h = 0
    labelsDyna.putItem(key, {"L": value})
Example #24
def stemmed_words(doc):
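    # Yield the stem of each token the analyzer produces for the document.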
    return (mystem.stem(w) for w in analyzer(doc))
Example #25
def correctSentence(sentence, index):
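    # Stem the POS-tagged word at the given index, collect real words built from a verified root,
    # and pick a replacement using the surrounding bigram context.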
    taggedS = tagger.applyMLTag(sentence)
    word = taggedS[index][0]
    POStag = taggedS[index][1]
    stemList = stemmer.stem(word, POStag)
    #remove duplicates
    verifiedWords = []
    for s in stemList:
        #("found stem "+str(s))
        tags = tagger.getTagsForWord(s[0])
        if len(tags) > 0:
            #print("stem added")
            verifiedWords += [s]
#at this point, verifiedWords should contain only real words
    if len(verifiedWords) == 0:
        #print("No verified words.")
        return "No Answer"

    replacementWord = ""
    #print "Entering while loop"
    while (replacementWord == "" and len(verifiedWords) > 0):
        #find the shortest word/root
        root = verifiedWords[0]
        numVerifiedLeft = len(verifiedWords)
        for w in verifiedWords:
            if len(w[0]) <= len(root[0]):
                root = w
#print("shortest word is "+str(root))
#possibles should contain all words that can contain the root
            possibles = tagger.getWordsWithRoot(root[0])
            if (root[0][-1] == 'e'):
                possibles += tagger.getWordsWithRoot(root[0][:-1])
            elif (root[0][-1] == 'y'):
                possibles += tagger.getWordsWithRoot(root[0][:-1] + 'i')
            for row in stemmer.csvReader("irregularPastVerbs.csv"):
                if (row[0] == root[0]):
                    possibles += tagger.getWordsWithRoot(row[1])
                    possibles += tagger.getWordsWithRoot(row[2])
#print("possibles for "+str(root)+" are "+str(possibles))
#actualPossibles should contain all words that can be stemmed to the root
            possibles.sort(key=lambda x: len(x[0]), reverse=False)
            possibles = possibles[:40]
            actualPossibles = []
            for word in possibles:
                if (stemmer.isRootOfWord(root[0], root[1], word[0], word[1])):
                    actualPossibles += [word]
            print("actual possibles for " + str(root) + " are " +
                  str(actualPossibles))
            prevWord = ""
            if index > 0:
                prevWord = sentence[index - 1]
            nextWord = ""
            if index < len(sentence) - 1:
                nextWord = sentence[index + 1]
            replacementWord = MLWordUsingBigrams(prevWord, nextWord,
                                                 actualPossibles)
            #print("replacement word found for root "+str(root)+" is "+replacementWord)
            #						verifiedWords.remove(root)
            numVerifiedLeft -= 1
        if (numVerifiedLeft == 0 and replacementWord == ""):
            #print("No good replacements found. Cry now.")
            return "No Answer"

#print("We highly reccomend that you replace your word with "+replacementWord)
#print("Your sentence would then become:")
        sentence[index] = replacementWord
        newSentence = ""
        for w in sentence:
            newSentence += (w + " ")
        print newSentence
        return sentence
Example #26
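# Split the date-aligned news/stock data 75/25 into training and test sets, then either fit the
# model or predict from freshly stemmed news.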
border = int(connections_count * 0.75)

training_dates = total_dates[:border]
training_news = total_news_sequence[:border]
training_stocks = total_stocks[:border]
training_count = border

testing_dates = total_dates[border:]
testing_news = total_news_sequence[border:]
testing_stocks = total_stocks[border:]
testing_count = total_count - border

total_X = numpy.array(total_news_sequence)
total_y = numpy.array(total_stocks)

training_X = numpy.array(training_news)
training_y = numpy.array(training_stocks)

testing_X = numpy.array(testing_news)
testing_y = numpy.array(testing_stocks)

if sys.argv[1] == '-f':
    fit(company, training_X, training_y, testing_X, testing_y)
else:
    news_dates, news, news_count = readNews(path + predict_path)
    stems_dates, stems, stems_count = stem(news_dates, news, news_count)
    news_sequences = sequence.pad_sequences(
        sequences=tokenizer.texts_to_sequences(stems))
    y = predict(news_sequences, company)
    print(y)