Example #1
def visitTextFile(self, textfile):
    splitter = Splitter(textfile.filePath, len(self.workers))
    file_split_result = splitter.split()
    self.operations[textfile.id] = FilePartition(textfile.id,
                                                 len(self.workers),
                                                 file_split_result,
                                                 textfile.filePath)
    self._set_collect_count(textfile)
Example #2
def processQuestion(gloveModel,
                    question,
                    minLen=1,
                    maxLen=3,
                    useAPI=False,
                    useSynonyms=False):
    tagger = POSTagger()
    pos = tagger.parse(question)
    # create splitter and generalizer
    splitter = Splitter()
    if question[-1] == '?' or question[-1] == '.':
        question = question[:-1]
    gen_question = splitter.generalize(question, pos)
    labels = []
    resultsExists = False
    if not useAPI:
        parts = list(splitter.split(gen_question, min=minLen, max=maxLen))
    else:
        resultsExists = True
        apiResult, _ = api.getBinaryRelations(question)
        parts = [
            rel.predicate for rel in apiResult
            if len(rel.predicate_positions_) > 1
        ]
        for part in parts:
            if len(part.split()) > 1:
                labels.append(part.split()[0] +
                              ''.join(''.join([w[0].upper(), w[1:].lower()])
                                      for w in part.split()[1:]))
        if useSynonyms:
            predicates = [max(part.split(), key=len) for part in parts]
            if predicates is not None and len(predicates) > 0:
                for predicate in predicates:
                    for part in list(parts):
                        if predicate in part:
                            for syn in gloveModel.gloveModel.most_similar(
                                    predicate.lower()):
                                parts.append(part.replace(predicate, syn[0]))
        if len(parts) == 0:
            resultsExists = False
            parts = list(splitter.split(gen_question, min=minLen, max=maxLen))
    # create embedder part
    vectors = []
    for part in parts:
        vectors.append(gloveModel.getVector(part))
    return vectors, parts, pos, gen_question, labels, resultsExists
Example #3
def process_text(text):
    splitter = Splitter()
    postagger = POSTagger()

    # Split the sentences into words
    splitted_sentences = splitter.split(text)

    # Do part-of-speech tagging on the words
    pos_tagged_sentences = postagger.pos_tag(splitted_sentences)

    # dicttagger is assumed to be a module-level dictionary tagger defined elsewhere
    dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
    return sum_score(dict_tagged_sentences)
""" This splitter attempts to maximize apparent total surplus """
from splitter import Splitter
from splitter import Bid

class SurplusMaximizer(Splitter):
    def score(self, bid, averages):
        return bid.amount - averages[bid.item]

items = ["Room 1", "Room 2", "Room 3"]
bids = [Bid("Room 1", "Joey", 10), Bid("Room 1", "Josh", 15),
        Bid("Room 2", "Joey", 5), Bid("Room 2", "Josh", 0)]
s = SurplusMaximizer()
print(s.split(items, ["Joey", "Josh"], bids))
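
The docstring states the goal: a bid scores by how far its amount exceeds the average bid for its item. The base Splitter here is project-local and its split logic is not shown, so the following is only a hypothetical sketch of how such a score hook could be consumed (NaiveSplitter and the bid.person attribute are assumptions, not the library's API): give each item to its highest-scoring bid.

class NaiveSplitter:
    # Hypothetical stand-in for the project's Splitter, shown only to
    # illustrate how a subclass-provided score() could drive the split.
    def score(self, bid, averages):
        return bid.amount  # default: the highest bid wins

    def split(self, items, people, bids):
        # average bid amount per item, made available to score()
        averages = {
            item: sum(b.amount for b in bids if b.item == item) /
            max(1, sum(1 for b in bids if b.item == item))
            for item in items
        }
        # give each item to the bid that score() ranks highest
        result = {}
        for item in items:
            candidates = [b for b in bids if b.item == item]
            if candidates:
                best = max(candidates, key=lambda b: self.score(b, averages))
                result[item] = best.person  # assumes Bid exposes .person
        return result

Subclassing this with the SurplusMaximizer.score above would then favor bids that beat the per-item average.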
Example #5
# r, sp, mt, train, test and Regressor are assumed to be defined elsewhere in the project.
columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
           'Age_categories_Missing', 'Age_categories_Infant',
           'Age_categories_Child', 'Age_categories_Teenager',
           'Age_categories_Young Adult', 'Age_categories_Adult',
           'Age_categories_Senior']

target_column = 'Survived'

r.train_machine(train[columns], train[target_column])

holdout = test

all_X = train[columns]
all_y = train[target_column]

train_x, test_x, train_y, test_y = sp.split(train[columns], train[target_column])
# toPrint = sr.get_train()['Age'].describe()
# print(toPrint)
r.train_machine(train_x, train_y)
predictions = r.predict(test_x)
accuracy = mt.model_accuracy(test_y, predictions)

regressor_object = Regressor()
reg = regressor_object.get_regressor()

mt.set_cross_score(reg, all_X, all_y, 10)
mt.sort_score()
scores = mt.get_scores()
cross_accuracy = mt.get_mean()

regressor_object_1 = Regressor()
Example #6
# -*- coding: utf-8 -*-
"""
Created on Sat May 25 10:06:24 2019

@author: Gerardo Cervantes
"""

from splitter import Splitter
# For testing split keys

split_keys = ['{PGUP}', '{BKSP}', '{F4}']

if __name__ == "__main__":
    splitter = Splitter()
    splitter.split('Livesplit', '{pgup}', 0)

import joblib
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Normalizer, Splitter and Filter are assumed to come from the project's own modules.

class Rule(object):
    def __init__(self):
        self.sentences = []
        self.abbreviation = {}
        self.load_data()
        self.load_abbrv()
        self.normalizer = Normalizer()
        self.splitter = Splitter()
        self.corrector = Filter()
        self.lemmatizer = WordNetLemmatizer()
        self.missing_apostrophe_vocab = [
            'isnt', 'arent', 'wasnt', 'werent', 'wont', 'dont', 'didnt',
            'doesnt', 'couldnt', 'shouldnt', 'hasnt', 'havent', 'hadnt'
        ]
        self.tokenizer_mistake_vocab = [
            'isn', 'aren', 'wasn', 'weren', 'won', 'don', 'didn', 'doesn',
            'couldn', 'shouldn', 'hasn', 'haven', 'hadn'
        ]
        self._norm = joblib.load('model.crfsuite')

    def load_data(self):
        with open('../data/reviews.txt', 'r') as myFile:
            self.sentences = myFile.read().splitlines()

    def load_abbrv(self):
        with open('../data/abbreviation.txt', 'r') as myFile:
            self.abbreviation = {
                i.split('---')[0]: i.split('---')[1]
                for i in myFile.read().splitlines()
            }

    def test2(self, sentence):
        tokens = self.splitter.split(sentence.lower())
        tokens = [j for i in tokens for j in i]
        print(tokens)
        X_test = self.normalizer.sent2features(tokens)
        for i in X_test:
            print(i)

        y_pred = self._norm.predict([X_test])
        print(y_pred)

    def process(self, sentence):
        tokens = self.splitter.split(sentence.lower())
        tokens = [j for i in tokens for j in i]
        X_test = self.normalizer.sent2features(tokens)

        y_pred = self._norm.predict([X_test])

        for i in range(len(y_pred[0])):
            if y_pred[0][i] == 'N':
                o, f = self.correct(tokens[i])
                if f:
                    if tokens[i][0].isupper():
                        tokens[i] = o[0].upper() + o[1:]
                    else:
                        tokens[i] = o
        return tokens

    def test(self):
        tokens = [
            self.splitter.split(i.lower()) for i in self.sentences[1001:5000]
        ]
        tokens = [j for i in tokens for j in i]
        X_test = [self.normalizer.sent2features(i) for i in tokens]

        y_pred = self._norm.predict(X_test)

        stats = {}
        output = []
        for i in range(len(tokens)):
            flag = 0
            for j in range(len(y_pred[i])):
                if y_pred[i][j] == 'N':
                    print(tokens[i][j])
                    o, f = self.correct(tokens[i][j])
                    if f:
                        output.append((tokens[i][j], o))
                    if tokens[i][j] not in stats:
                        stats[tokens[i][j]] = 0
                    stats[tokens[i][j]] += 1
                    flag = 1
            if flag == 1:
                print(' '.join(tokens[i]))
                print(y_pred[i])

        with open('inter_correct_2.txt', 'w') as myFile:
            print('start writing')
            for old, new in output:
                print(old, new)
                myFile.write(old + '\t' + new + '\n')

    def correct(self, term):
        ret = []
        flag = False  # whether the term was modified
        for i in term.split('.'):
            # handle elongated words (repeated letters) first
            i = i.lower()
            i, res = self.correct_elongated(i)
            if res:
                flag = True
                ret.append(i)
                continue

            # tokenization sometimes splits "didn't" into "didn" and "'t"
            if i in self.tokenizer_mistake_vocab:
                ret.append(i)
                continue

            # the tokenizer keeps the apostrophe in tokens such as 't; leave them alone
            if "'" in i:
                ret.append(i)
                continue

            # some words are hyphenated compounds; do not check them
            if '-' in i:
                ret.append(i)
                continue

            # fix negated auxiliaries that are missing the apostrophe
            if i in self.missing_apostrophe_vocab:
                i = self.correct_missing_apostrophe(i)
                flag = True
                ret.append(i)
                continue

            # past-tense verbs are already covered by the dictionary; only check noun plurals and third-person verb forms
            if i in self.normalizer.dct or self.lemmatizer.lemmatize(i, wordnet.NOUN) in self.normalizer.dct \
              or self.lemmatizer.lemmatize(i, wordnet.VERB) in self.normalizer.dct:
                ret.append(i)
                continue

            # abbreviations and shorthand
            if i in self.abbreviation:
                i = self.abbreviation[i]
                flag = True
                ret.append(i)
                continue

            if i.isalpha():
                res_s = self.corrector.process(i)
                tmp = []
                for res in res_s.split(' '):
                    if res != i and (
                            self.lemmatizer.lemmatize(
                                res, wordnet.NOUN) in self.normalizer.dct
                            or self.lemmatizer.lemmatize(
                                res, wordnet.VERB) in self.normalizer.dct
                            or self.lemmatizer.lemmatize(
                                res, wordnet.ADJ) in self.normalizer.dct
                            or self.lemmatizer.lemmatize(
                                res, wordnet.ADV) in self.normalizer.dct):
                        flag = True
                        tmp.append(res)
                if len(tmp) != 0:
                    ret.append(' '.join(tmp))
                else:
                    ret.append(i)
                continue

            ret.append(i)

        # rejoin the fragments that were split on '.'
        return '. '.join(ret), flag

    def correct_merged_words(self, term):
        if len(term) < 4:
            return term, False
        for i in range(1, len(term)):
            if term[:i] in self.normalizer.dct and term[i:] in self.normalizer.dct:
                return term[:i] + ' ' + term[i:], True
        return term, False

    def correct_missing_apostrophe(self, term):
        res = term[:-1] + "'" + term[-1]
        return res

    def correct_elongated(self, term):
        count = 0
        while True:
            start, end, flag = self.find_elongated_pos(term)
            if count == 0 and not flag:
                return term, False
            elif not flag:
                return term, True
            else:
                cand1 = term.replace(term[start:end + 1], term[start])
                cand2 = term.replace(term[start:end + 1], term[start] * 2)
                # if candidate 1 is in the dictionary, use it
                if cand1 in self.normalizer.dct:
                    term = cand1
                    continue
                # if candidate 2 is in the dictionary, use it
                if cand2 in self.normalizer.dct:
                    term = cand2
                    continue
                # if neither is in the dictionary, fall back to candidate 1
                term = cand1
            count += 1

    def find_elongated_pos(self, term):
        prev = ''
        start = 0
        ct = 1
        for idx, i in enumerate(term):
            if idx == 0:
                prev = i
                start = idx
            else:
                if i == prev:
                    ct += 1
                    if ct > 2:
                        end = idx
                        while end <= len(term) - 1 and term[end] == i:
                            end += 1
                        end = end - 1
                        return start, end, True
                else:
                    ct = 1
                    start = idx
                prev = i
        return -1, -1, False
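
As a hypothetical usage sketch (not part of the original snippet), the Rule class above could be driven as follows, assuming the data files and model.crfsuite it loads are actually present:

if __name__ == '__main__':
    # Requires ../data/reviews.txt, ../data/abbreviation.txt and model.crfsuite,
    # exactly as the constructor above assumes.
    rule = Rule()
    tokens = rule.process("the food was sooo goood but the service wasnt great")
    print(' '.join(tokens))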
from unittest import TestCase
# Splitter is assumed to be importable from the project under test.

class SplitterTest(TestCase):
    def setUp(self):
        self.s = Splitter()

    def test_returns_none_when_loot_is_undivisible_by_number_of_pirates(self):
        self.assertEqual(None, self.s.split([2, 3], 2))

    def test_returns_none_when_there_are_not_enough_gems(self):
        self.assertEqual(None, self.s.split([4, 2], 3))

    def test_returns_none_when_there_is_a_gem_greater_than_share(self):
        self.assertEqual(None, self.s.split([4, 2, 3], 3))

    def test_everybody_gets_the_same_kind_of_bucket_when_we_have_only_one_type_of_gem(self):
        self.assertEqual([[2], [2]], self.s.split([2, 2], 2))
        self.assertEqual([[2], [2], [2]], self.s.split([2, 2, 2], 3))
        self.assertEqual([[2, 2], [2, 2]], self.s.split([2, 2, 2, 2], 2))

    def test_everybody_gets_the_same_kind_of_bucket_when_we_have_the_same_set_of_gem_and_pirates(self):
        self.assertEqual([[3, 2], [3, 2]], self.s.split([3, 2, 3, 2], 2))
        self.assertEqual([[3, 2], [3, 2], [3, 2]], self.s.split([3, 2, 3, 2, 3, 2], 3))

    def test_everybody_gets_the_same_value_with_a_different_number_of_gems(self):
        self.assertEqual([[3], [2, 1]], self.s.split([1, 2, 3], 2))
        self.assertEqual([[3, 2, 2, 2], [3, 2, 2, 2], [3, 2, 2, 2]],
                         self.s.split([3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2], 3))  # the famous death case
        self.assertEqual([[7], [5, 2]], self.s.split([7, 5, 2], 2))

    def test_should_not_create_a_bucket_greater_than_share(self):
        self.assertEqual([[13, 1], [7, 7]], self.s.split([13, 7, 7, 1], 2))

    def test_should_rollback_when_the_first_decision_doesnt_fit(self):
        self.assertEqual([[7, 2, 2], [3, 3, 3, 2]], self.s.split([7, 3, 3, 3, 2, 2, 2], 2))

    def test_should_rollback_when_the_second_decision_doesnt_fit_also(self):
        self.assertEqual([[7, 2, 2, 2], [3, 3, 3, 2, 2]], self.s.split([7, 3, 3, 3, 2, 2, 2, 2, 2], 2))