Example #1
def update(weights, predseq, labelseq, postagseq, vecs1, vecs2, sent):
    for i in range(len(predseq)):
        true = labelseq[i]
        pred = predseq[i]
        pos = postagseq[i]
        vec1 = vecs1[i]
        vec2 = vecs2[i]
        if i == 0:
            prev_true = '*'
            prev_pred = '*'
        else:
            prev_true = labelseq[i - 1]
            prev_pred = predseq[i - 1]
        if true != pred:

            true_feats = features.extract(sent[i], true, prev_true, pos, vec1,
                                          vec2)
            for feat in true_feats:
                if feat in weights:
                    weights[feat] += step
            pred_feats = features.extract(sent[i], pred, prev_pred, pos, vec1,
                                          vec2)
            for feat in pred_feats:
                if feat in weights:
                    weights[feat] -= step
    return weights
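As a usage sketch: the update above is the inner step of a structured-perceptron epoch. In the hypothetical driver below, decode(), n_epochs, and the training tuples are assumptions for illustration, not part of the original source.

# Hypothetical training loop around update(); decode() is assumed to
# return the highest-scoring tag sequence under the current weights.
for epoch in range(n_epochs):
    for sent, labelseq, postagseq, vecs1, vecs2 in train_data:
        predseq = decode(weights, sent, postagseq, vecs1, vecs2)
        weights = update(weights, predseq, labelseq, postagseq,
                         vecs1, vecs2, sent)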
Example #2
def ikProblemToFeatures(ikproblem,featureList):
    """Standard feature extractor for IKProblems"""
    if isinstance(ikproblem,dict):
        #assume it's a JSON object already
        return features.extract(ikproblem,featureList)
    elif isinstance(ikproblem,IKProblem):
        jsonObj = ikproblem.toJson()
        return features.extract(jsonObj,featureList)
    elif isinstance(ikproblem,IKObjective):
        return ikObjectiveToFeatures(ikproblem,featureList)
    else:
        assert hasattr(ikproblem,'__iter__'),"IK problem must either be an IKProblem, single IKObjective, or a list"
        return sum([ikObjectiveToFeatures(o,f) for o,f in zip(ikproblem,featureList)],[])
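A side note on the last branch: sum(lists, []) flattens a list of per-objective feature lists but copies quadratically; itertools.chain.from_iterable is the usual linear-time alternative. A minimal, self-contained demonstration:

import itertools

lists = [[1.0, 2.0], [3.0], [4.0, 5.0]]
flat_sum = sum(lists, [])                                # O(n^2) copies
flat_chain = list(itertools.chain.from_iterable(lists))  # linear time
assert flat_sum == flat_chain == [1.0, 2.0, 3.0, 4.0, 5.0]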
Example #3
    def testPOSBIGRAMS(self):
      text = "I love you."
      results = features.extract(text, [Features.POSBIGRAMS], False)
      result = results[Features.POSBIGRAMS]
      self.assertEqual(1, result['(VBP)-(PRP)'])
      self.assertEqual(1, result['(PRP)-(VBP)'])
      self.assertEqual(1, result['(PRP)-(.)'])

      results = features.extract(text, [Features.POSBIGRAMS], True)
      result = results[Features.POSBIGRAMS]
      self.assertEqual(1/3.0, result['(VBP)-(PRP)'])
      self.assertEqual(1/3.0, result['(PRP)-(VBP)'])
      self.assertEqual(1/3.0, result['(PRP)-(.)'])
      self.assertEqual(1, sum(result.values()))
Example #4
    def testPOSUnigrams(self):
      text = "I love you."
      results = features.extract(text, [Features.POSUNIGRAM], False)
      result = results[Features.POSUNIGRAM]
      
      self.assertEqual(1, result['VBP'])
      self.assertEqual(2, result['PRP'])
      self.assertEqual(1, result['.'])

      results = features.extract(text, [Features.POSUNIGRAM], True)
      result = results[Features.POSUNIGRAM]
      self.assertEqual(1/4.0, result['VBP'])
      self.assertEqual(2/4.0, result['PRP'])
      self.assertEqual(1/4.0, result['.'])
      self.assertEqual(1, sum(result.values()))
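Both POS tests exercise the same convention: with the final flag set to True, each raw count is divided by the total so the returned values form a distribution summing to 1. A minimal sketch of that normalization (the helper name is hypothetical, not from the tested module):

def normalize(counts):
    # Divide each count by the total, mirroring the normalized=True
    # behavior the tests above check; hypothetical illustration only.
    total = float(sum(counts.values()))
    if total == 0:
        return dict(counts)
    return {key: value / total for key, value in counts.items()}

assert normalize({'VBP': 1, 'PRP': 2, '.': 1}) == {'VBP': 0.25, 'PRP': 0.5, '.': 0.25}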
Example #5
def update(weights, predseq, labelseq, sent, postagseq, info, ad):
    for i in range(len(predseq) + 1):
        if i == len(predseq):
            word = ''
            pos = ''
            true = '<STOP>'
            pred = '<STOP>'
        else:
            word = sent[i]
            true = labelseq[i]
            pred = predseq[i]
            pos = postagseq[i]

        if i == 0:
            prev_true = '*'
            prev_pred = '*'
        else:
            prev_true = labelseq[i-1]
            prev_pred = predseq[i-1]

        if true != pred or (i == len(predseq) and prev_true != prev_pred):
            true_feats = extract(word, true, prev_true, pos, info)
            pred_feats = extract(word, pred, prev_pred, pos, info)

            up = set(true_feats).difference(set(pred_feats))
            down = set(pred_feats).difference(set(true_feats))
            # ADAGRAD
            for u in up:
                if u in ad:
                    ad[u] += 1
            for d in down:
                if d in ad:
                    ad[d] += 1
            
            for u in up:
                if u in weights:
                    if ad[u] > 0.0:
                        weights[u] += step/math.sqrt(ad[u]) # ADAGRAD
                    else:
                        weights[u] += step # ??
            for d in down:
                if d in weights:
                    if ad[d] > 0.0:
                        weights[d] -= step/math.sqrt(ad[d]) # ADAGRAD
                    else:
                        weights[d] -= step # ??
        
    return weights  
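The AdaGrad bookkeeping above scales each feature's step by the square root of its accumulated update count, so frequently updated features move more cautiously. A self-contained sketch of that per-feature step size:

import math

def adagrad_step(base_step, update_count):
    # Effective step shrinks as a feature accumulates updates,
    # mirroring the step / math.sqrt(ad[u]) rule in update() above.
    if update_count > 0:
        return base_step / math.sqrt(update_count)
    return base_step

assert adagrad_step(1.0, 4) == 0.5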
Example #6
def test_level2(sourceword,target):
    bestoutfn = "../L2output/{0}.{1}.best".format(sourceword, target)
    oofoutfn = "../L2output/{0}.{1}.oof".format(sourceword, target)
    bestoutfile = open(bestoutfn,'w')
    oofoutfile = open(oofoutfn,'w')

    level2_classifier = util_run_experiment.get_pickled_classifier(sourceword,target,'level2')
    frd1,frd2,frd3,frd4 = sorted(list(get_four_friends(target)))   ##Need 4 more features from level1.
    classfrd1,classfrd2,classfrd3,classfrd4 = get_level1_classifiers(frd1,frd2,frd3,frd4,sourceword)
    # finaldir = "../trialdata/alltrials/"
    finaldir = "../finaltest"
    problems = util_run_experiment.get_test_instances(finaldir, sourceword)    

    
    for problem in problems:
        level1_features = features.extract(problem)
        answer_frd1 = classfrd1.classify(level1_features)
        answer_frd2 = classfrd2.classify(level1_features)
        answer_frd3 = classfrd3.classify(level1_features)
        answer_frd4 = classfrd4.classify(level1_features)
        level2_features = train_extracted_level2.extend_features(level1_features,(answer_frd1,answer_frd2,answer_frd3,answer_frd4),frd1,frd2,frd3,frd4)
        level2_answer = level2_classifier.classify(level2_features)
        level2_dist = level2_classifier.prob_classify(level2_features)
        oof_answers = util_run_experiment.topfive(level2_dist)
        print(output_one_best(problem, target, level2_answer), file=bestoutfile)
        print(output_five_best(problem, target, oof_answers),
              file=oofoutfile)
Example #7
def main():
    parser = argparse.ArgumentParser(description='clwsd')
    parser.add_argument('--sourceword', type=str, nargs=1, required=True)
    parser.add_argument('--targetlang', type=str, nargs=1, required=True)
    parser.add_argument('--classifier', type=str, nargs=1, required=False)
    args = parser.parse_args()

    all_target_languages = "de es fr it nl".split()
    assert args.targetlang[0] in all_target_languages
    target = args.targetlang[0]
    sourceword = args.sourceword[0]
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = get_maxent_classifier(sourceword, target)

    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    ## XXX(alexr): fix later.
    stanford.taggerhome = "/home/alex/software/stanford-postagger-2012-11-11"
    problems = extract_wsd_problems(fn)
    gold_answers = read_gold.get_gold_answers(sourceword, target)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(problem.tokenized)
        print(answer)
        label = gold_answers[problem.instance_id]
        print("CORRECT" if label == answer else "WRONG", end=" ")
        print("should be:", label)
Example #8
def main():
    parser = argparse.ArgumentParser(description='clwsd')
    parser.add_argument('--sourceword', type=str, required=True)
    parser.add_argument('--targetlang', type=str, required=True)
    parser.add_argument('--taggerhome', type=str, required=True)
    args = parser.parse_args()

    all_target_languages = "de es fr it nl".split()
    assert args.targetlang in all_target_languages
    target = args.targetlang
    sourceword = args.sourceword
    stanford.taggerhome = args.taggerhome

    gold_answers = read_gold.get_gold_answers(sourceword, target)
    instances = get_training_data(sourceword, target)
    print("... training ...")
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = MaxentClassifier.train(instances, trace=0, algorithm='megam')
    print("LABELS", classifier.labels())

    ## with open("../eval/{0}.output".format(sourceword), "w") as outfile:
    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    problems = extract_wsd_problems(fn)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(output_one_best(problem, target, answer))
        label = gold_answers[problem.instance_id]
        print("CORRECT" if label == answer else "WRONG")
        print("distribution was...")
        dist = classifier.prob_classify(featureset)
        for key in dist.samples():
            print(" ", key, dist.prob(key))
Example #9
def main():
    with open('model.pickle') as f:
        vectorizer, model = pickle.load(f)

    for hyp1, hyp2, ref in sentences():
        f1 = features.extract(ref, hyp1)
        f2 = features.extract(ref, hyp2)
        f = features.diff(f2, f1)
        if f['min_match'] == f['min_match2'] == f['min_match3'] == 0:
            print 0
            continue
        score = model.predict(vectorizer.transform((f,))) # w . (f_2 - f_1)
        if score > 0:
            print 1
        else:
            print -1
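This is pairwise ranking: the model scores the feature difference, so a positive w . (f2 - f1) prefers hypothesis 2. Assuming features.diff is a per-key subtraction (an assumption; the real implementation may treat missing keys differently), it would look roughly like:

def diff(f2, f1):
    # Hypothetical sketch of a per-key feature difference f2 - f1.
    keys = set(f2) | set(f1)
    return {k: f2.get(k, 0.0) - f1.get(k, 0.0) for k in keys}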
Example #10
def mrf_optimize(problem):
    """Build the MRF and do the optimization!!"""
    featureset = features.extract(problem)

    for lang in all_target_languages:
        classifier = classifiers[lang]
        unary = unary_penalty_table(classifier, featureset)
        print(unary)
        langnode = Node(lang, unary)

    ## get all the combinations of nodes.
    langpairs = list(itertools.combinations(all_target_languages, 2))

    ## create an edge for each language pair.
    for l1, l2 in langpairs:
        print("building edges for", l1, l2)
        edgepotentials = {}
        for val1 in possible_values(l1):
            for val2 in possible_values(l2):
                cooccurrence = cooccurrences[(l1,l2)]
                joint = cooccurrence.lookup_joint(l1, val1, l2, val2)
                # negative logprob of the joint probability. Definitely the best
                # edge potential, for sure.
                edgepotentials[(val1,val2)] = -math.log(joint, 2)
        Edge(l1, l2, edgepotentials)

    ## XXX how many iterations?
    print("mrf solving!!!")
    answers, oof_answers = beliefprop(10)
    print("mrf solved!!!")
    return answers, oof_answers
Example #11
def get_training_data_from_extracted(sourceword, targetlang):
    """Return a list of (featureset, label) for training."""
    out = []
    problems = []
    fn = "../trainingdata/{0}.{1}.train".format(sourceword, targetlang)

    with open(fn) as infile:
        lines = infile.readlines()
        lines = [line.strip() for line in lines]
        contexts = [line for line in lines[0::3]]
        indices = [int(line) for line in lines[1::3]]
        labelss = [line.split(",") for line in lines[2::3]]
        assert len(contexts) == len(labelss) == len(indices)

    answers = []
    for context, index, labels in zip(contexts, indices, labelss):
        problem = WSDProblem(sourceword, context,
                             testset=False, head_index=index)
        for label in labels:
            if label == '': continue
            problems.append(problem)
            answers.append(label)

    for problem,answer in zip(problems, answers):
        featureset = features.extract(problem)
        label = answer
        assert(type(label) is str)
        out.append((featureset, label))
        #print("###the features are: \n{}".format(featureset))
        #input()
    return out
Example #12
def extract_all_features(formatted_corpus):
    sent_cnt = 0

    y_symbols = []  # Our array of transitions
    X_dict = list()  # Our matrix

    for sentence in formatted_corpus:
        sent_cnt += 1
        if sent_cnt % 1000 == 0:
            a = 1
            # print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'

        while queue:
            x = features.extract(stack, queue, graph, FEATURE_NAMES, sentence)
            X_dict.append(x)

            stack, queue, graph, trans = reference(stack, queue, graph)

            y_symbols.append(trans)
        stack, graph = transition.empty_stack(stack, graph)

        # Poor man's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
        # print(y_symbols)
        # print(graph)

    return X_dict, y_symbols
Example #13
def read_dataset(filename):
    X, y = [], []
    points = features.extract(filename)
    for f, v in points:
        X.append(f.todict())
        y.append(v)
    return X, y
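The dicts produced by f.todict() fit scikit-learn's DictVectorizer, which other examples in this collection also use. A hedged training sketch on the output; the file name and the choice of LogisticRegression are illustrative assumptions:

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

X, y = read_dataset('train.dat')         # hypothetical file name
vec = DictVectorizer(sparse=True)
X_matrix = vec.fit_transform(X)          # list of dicts -> sparse matrix
clf = LogisticRegression(max_iter=1000).fit(X_matrix, y)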
Example #14
def getTrainDS():
    root = './data'
    X = np.mat([0]*(len(markerList)))
    y = np.array([])

    for lang in langs:
        for scDir in os.listdir(os.path.join(root, lang)):
            sc = getSC(os.path.join(root, lang, scDir))

            feature = extract(sc)
            feature = np.asmatrix(feature)

            X = np.append(X, feature, axis=0)
            X = np.asmatrix(X)
            y = np.append(y, [lang])

            print "[SUCCESS] Extracted", scDir

            del feature
    X = X[1:]

    print "Shuffling datasets."
    X, y = shuffle(X, y)
    print "[SUCCESS] Shuffled datasets"

    print "Splitting and saving datasets."
    ds = splitDS(X, y)
    print "[SUCCESS] Split datasets."
    saveDS(ds)
    print "[SUCCESS] Saved datasets."

    del X, y
Example #15
    def testLetterUnigram(self):
      text = "ekin"
      results = features.extract(text,[Features.LETTERUNIGRAM],False)
      result = results[Features.LETTERUNIGRAM]
      self.assertEqual(1, result[ord('e')-ord('a')])
      self.assertEqual(1, result[ord('k')-ord('a')])
      self.assertEqual(1, result[ord('i')-ord('a')])
      self.assertEqual(1, result[ord('n')-ord('a')])
      self.assertEqual(4, sum(result))
      results = features.extract(text,[Features.LETTERUNIGRAM],True)
      result = results[Features.LETTERUNIGRAM]
      self.assertEqual(1/4.0, result[ord('e')-ord('a')])
      self.assertEqual(1/4.0, result[ord('k')-ord('a')])
      self.assertEqual(1/4.0, result[ord('i')-ord('a')])
      self.assertEqual(1/4.0, result[ord('n')-ord('a')])
      self.assertEqual(4/4.0, sum(result))

      text = ""
      results = features.extract(text,[Features.LETTERUNIGRAM],False)
      result = results[Features.LETTERUNIGRAM]
      self.assertEqual(0, sum(result))
      results = features.extract(text,[Features.LETTERUNIGRAM],True)
      result = results[Features.LETTERUNIGRAM]
      self.assertEqual(0, sum(result))

      text = "eee"
      results = features.extract(text,[Features.LETTERUNIGRAM],False)
      result = results[Features.LETTERUNIGRAM]
      self.assertEqual(3, result[ord('e')-ord('a')])
      self.assertEqual(3, sum(result))
      results = features.extract(text,[Features.LETTERUNIGRAM],True)
      result = results[Features.LETTERUNIGRAM]
      self.assertEqual(1.0, result[ord('e')-ord('a')])
      self.assertEqual(1.0, sum(result))

      text = "ee k ee"
      results = features.extract(text,[Features.LETTERUNIGRAM],False)
      result = results[Features.LETTERUNIGRAM]
      self.assertEqual(4, result[ord('e')-ord('a')])
      self.assertEqual(1, result[ord('k')-ord('a')])
      self.assertEqual(5, sum(result))
      results = features.extract(text,[Features.LETTERUNIGRAM],True)
      result = results[Features.LETTERUNIGRAM]
      self.assertEqual(4/5.0, result[ord('e')-ord('a')])
      self.assertEqual(1/5.0, result[ord('k')-ord('a')])
      self.assertEqual(5/5.0, sum(result))    
      
      text = "\nn"
      results = features.extract(text, [Features.LETTERUNIGRAM], False)
      result = results[Features.LETTERUNIGRAM]
      self.assertEqual(1.0, result[ord('n')-ord('a')])
      self.assertEqual(1.0, sum(result))
Example #16
def main(args: argparse.Namespace) -> None:
    with open(args.data, "r") as src:
        with open(args.features, "w") as out_file:
            for line in src:
                line = word_tokenize(line.replace(":", "_"))
                feature_list = features.extract(line)
                for feature in feature_list:
                    print("\t".join(feature), file=out_file)
                print("", file=out_file)
Example #17
def get_score(word, current_tag, prev_tag, postag, weights, info):
    score = 0.0
    features_list = extract(word, current_tag, prev_tag, postag, info)

    for feature in features_list:
        if feature in weights:
            score += weights[feature]

    return score
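get_score is a sparse dot product between the extracted indicator features and the weight vector; a greedy tagger would take the argmax over candidate tags. A sketch, where TAGSET and the surrounding variables are assumptions rather than part of the source:

# Hypothetical greedy use of get_score(); TAGSET is assumed.
best_tag = max(TAGSET,
               key=lambda tag: get_score(word, tag, prev_tag,
                                         postag, weights, info))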
Example #18
def get_score(word, current_tag, prev_tag, pos, v1, v2, weights):
    score = 0.0
    features_list = extract(word, current_tag, prev_tag, pos, v1, v2)

    for feature in features_list:
        if feature in weights:
            score += weights[feature]

    return score, features_list
Example #19
def get_score(word, current_tag, prev_tag, pos, v1, v2, weights):
    score = 0.0
    features_list = extract(word, current_tag, prev_tag, pos, v1, v2)

    for feature in features_list:
        if feature in weights:
            score += weights[feature]

    return score, features_list
Example #20
def playing(colrow, g):  # takes a board position and adds a players move
    col, row = colrow[0], colrow[1:3]
    row = BOARD - int(row)
    col = alphabet.index(col)
    tmp = g[0].copy()
    g[0] = g[1]
    g[1] = tmp
    g = features.extract(g, row, col, BOT)
    return prediction(g)
Example #21
def ikProblemToFeatures(ikproblem, featureList):
    """Standard feature extractor for IKProblems"""
    if isinstance(ikproblem, dict):
        #assume it's a JSON object already
        return features.extract(ikproblem, featureList)
    elif isinstance(ikproblem, IKProblem):
        jsonObj = ikproblem.toJson()
        return features.extract(jsonObj, featureList)
    elif isinstance(ikproblem, IKObjective):
        return ikObjectiveToFeatures(ikproblem, featureList)
    else:
        assert hasattr(
            ikproblem, '__iter__'
        ), "IK problem must either be an IKProblem, single IKObjective, or a list"
        return sum([
            ikObjectiveToFeatures(o, f)
            for o, f in zip(ikproblem, featureList)
        ], [])
Example #22
def read_dataset(filename, use_text_features):
    if use_text_features:
        mask = set(['meta', 'text'])
    else:
        mask = set(['meta'])

    X, y = [], []
    points = list(features.filter(features.extract(filename), mask))
    for f, v in points:
        X.append(f.todict())
        y.append(v)
    return X, y
Example #23
def get_training_data_from_extracted(sourceword, targetlang):
    """Return a list of (featureset, label) for training."""

    frd1, frd2, frd3, frd4 = sorted(list(get_four_friends(targetlang)))  ##Get other four languages.
    ##Get the intersection of four training sentences.
    tool_class = Occurrence(sourceword, frd1, frd2)
    intersection = tool_class.get_common_four_sents(sourceword, frd1, frd2, frd3, frd4)

    out = []
    problems = []
    fn = "../trainingdata/{0}.{1}.train".format(sourceword, targetlang)

    with open(fn) as infile:
        lines = infile.readlines()
        lines = [line.strip() for line in lines]
        contexts = [line for line in lines[0::3]]
        indices = [int(line) for line in lines[1::3]]
        labelss = [line.split(",") for line in lines[2::3]]
        assert len(contexts) == len(labelss) == len(indices)

    print("the length of them...", len(contexts), len(indices), len(labelss))
    # input()
    answers = []
    extention = []
    for context, index, labels in zip(contexts, indices, labelss):
        sentence_id = context + "####" + str(index)
        if (
            sentence_id in intersection
        ):  ##If this sentence also appears in 4 other languages, we can use more features...
            problem = WSDProblem(sourceword, context, testset=False, head_index=index)

            more_featuress = intersection[sentence_id]
            # print(more_featuress)
            for more_feature in more_featuress:
                for label in labels:
                    if label == "":
                        continue
                    problems.append(problem)
                    # more_features = intersection[context]
                    extention.append(more_feature)
                    answers.append(label)
    print("###intersection for five languages....{}\n".format(len(extention)))

    for problem, answer, more_feature in zip(problems, answers, extention):
        featureset = features.extract(problem)
        featureset = extend_features(featureset, more_feature, frd1, frd2, frd3, frd4)
        label = answer
        assert type(label) is str
        # print("=================@@@@features {}\n@@@@label{}\n".format(featureset,label))
        out.append((featureset, label))
    print("###Length of the output should be the same{}\n".format(len(out)))
    return out
Example #24
def extract_all_train_features(sents, tagseqs, postagseqs, info):
    featset = set([])
    i = 0
    for sent in sents:
        sys.stderr.write(str(i) + "\r")
        j = 0
        for word in sent:
            tag = tagseqs[i][j]
            postag = postagseqs[i][j]
            if j == 0: # first position
               prev = '*'
            else:
               prev = tagseqs[i][j-1]
            featset.update(extract(word, tag, prev, postag, info)) # get a list of all features possible
            j += 1
        # features for the last label
        featset.update(extract('', '<STOP>', tag, '', info))
        i += 1
    featlist = list(featset)
    for f in featlist:
        print f
    return featlist
Example #25
 def extract_features(self):
     """
     This method is used to extract features (A,B,D).
     """
     returnVars = features.extract(self.original_image, self.contour_mask,
                                   self.contour)
     if len(returnVars) == 0:
         self.feature_set = returnVars
     else:
         self.feature_set = returnVars[0]
         self.asymmetry_horizontal = returnVars[1]
         self.asymmetry_vertical = returnVars[2]
         self.warp_img_segmented = returnVars[3]
Example #26
def get_training_data(sourceword, target):
    """Return a list of (featureset, label) for training."""
    out = []
    ## map from id to labels
    gold_answers = read_gold.get_gold_answers(sourceword, target)
    problems = get_training_problems(sourceword)

    ## now collate them.
    for problem in problems:
        theid = problem.instance_id
        featureset = features.extract(problem)
        label = gold_answers[theid]
        out.append((featureset, label))
    return out
Example #27
def convert_to_features(data_part) -> np.ndarray:
    """This converts a given group to features"""
    current_features = Features()

    feature_list = list()
    last_features = current_features
    for row in data_part.itertuples():
        row_data = Row(row)
        current_features.update(row_data)
        feature_list.append(
            features.extract(row_data, current_features, last_features))
        last_features = current_features.snapshot()

    return np.array(feature_list)
Example #28
def get_training_data_from_extracted(sourceword, targetlang):
    """Return a list of (featureset, label) for training."""


    frd1,frd2,frd3,frd4 = sorted(list(get_four_friends(targetlang)))  ##Get other four languages.
    classfrd1,classfrd2,classfrd3,classfrd4 = get_level1_classifiers(frd1,frd2,frd3,frd4,sourceword)

    ##Get the intersection of four training sentences.
    tool_class = Occurrence(sourceword,frd1,frd2)

    out = []
    problems = []
    fn = "../trainingdata/{0}.{1}.train".format(sourceword, targetlang)

    with open(fn) as infile:
        lines = infile.readlines()
        lines = [line.strip() for line in lines]
        contexts = [line for line in lines[0::3]]
        indices = [int(line) for line in lines[1::3]]
        labelss = [line.split(",") for line in lines[2::3]]
        assert len(contexts) == len(labelss) == len(indices)

    print("the length of them...",len(contexts),len(indices),len(labelss))
    #input()
    answers = []
    for context, index, labels in zip(contexts, indices, labelss):
        problem = WSDProblem(sourceword, context,
                             testset=False, head_index=index)

            #print(more_featuress)
        for label in labels:
            if label == '': continue
            problems.append(problem)
            #more_features = intersection[context]
            answers.append(label)

    for problem,answer in zip(problems, answers):
        level1_features = features.extract(problem)
        answer_frd1 = classfrd1.classify(level1_features)
        answer_frd2 = classfrd2.classify(level1_features)
        answer_frd3 = classfrd3.classify(level1_features)
        answer_frd4 = classfrd4.classify(level1_features)
        level2_features = extend_features(level1_features,(answer_frd1,answer_frd2,answer_frd3,answer_frd4),frd1,frd2,frd3,frd4)
        label = answer
        assert(type(label) is str)
        #print("=================@@@@features {}\n@@@@label{}\n".format(featureset,label))
        out.append((level2_features, label))
    print("###Length of the output should be the same{}\n".format(len(out)))
    return out
Example #29
def main():
    print(os.getcwd())
    print("Create Dataset")
    signal, y = dataset()
    df_y = pd.DataFrame(data=y, columns=['genre'])

    # construct features
    print("Feature Extraction")
    df_x = pd.DataFrame()
    for i in range(0, len(signal)):
        print("number " + str(i))
        new_x = pd.DataFrame(features.extract(signal[i]), index=[i])
        df_x = df_x.append(new_x)

    saveFeature(df_x, df_y)
Example #30
def disambiguate_words(words):
    """Given a list of words/lemmas, return a list of disambiguation answers for
    them."""
    classifiers = [classifier_for(word, nonnull=True) for word in words]
    answers = []
    for i in range(len(words)):
        faketagged = [(w,None) for w in words]
        feat = features.extract(faketagged, i)
        classif = classifiers[i]
        ans = classif.classify(feat)
        if ans == UNTRANSLATED:
            ans = mfs_translation(words[i])
            print("MFS!!!", words[i], "==>", ans)
        answers.append(ans)
    return [str(ans) for ans in answers]
Example #31
def main():
    setup_logging()
    for fn in sys.argv[1:] or ["input.png"]:
        im = cv.LoadImage(fn)
        fts = extract(im)

        pfn = fn + "-features.dat"
        info("Storing feature pickle in %s", pfn)
        dump(fts, file(pfn, "wb"))

        for l, layer in enumerate(fts):
            for fname, fval in layer.items():
                ffn = "%s-feat-%d-%s.png" % (fn, l, fname)
                info("Rendering feature %s", ffn)
                mat2pil(fval).save(ffn)
Example #32
def update(weights, predseq, labelseq, postagseq, vecs1, vecs2, sent):
    for i in range(len(predseq)):
        true = labelseq[i]
        pred = predseq[i]
        pos = postagseq[i]
        vec1 = vecs1[i]
        vec2 = vecs2[i]
        if i == 0:
            prev_true = '*'
            prev_pred = '*'
        else:
            prev_true = labelseq[i-1]
            prev_pred = predseq[i-1]
        if true != pred:
            
            true_feats = features.extract(sent[i], true, prev_true, pos, vec1, vec2)
            for feat in true_feats:
                if feat in weights:
                    weights[feat] += step
            pred_feats = features.extract(sent[i], pred, prev_pred, pos, vec1, vec2)
            for feat in pred_feats:
                if feat in weights:
                    weights[feat] -= step
    return weights  
Example #33
def prediction(gamemove, col):
    predictmove = gamemove[None, :]
    resultmatrix = model.predict(predictmove)
    resultmatrix = np.reshape(resultmatrix, [BOARD, BOARD])
    result = np.unravel_index(resultmatrix.argmax(), resultmatrix.shape)
    while gamemove[4][result[0]][result[1]] == 0:  # if move is illegal
        resultmatrix[result[0]][result[1]] = 0  # delete from softmax
        result = np.unravel_index(resultmatrix.argmax(),
                                  resultmatrix.shape)  #find second best option
    resultmatrix = np.round_(resultmatrix, 1) * 10
    print(alphabet[result[1] - BOARD], BOARD - result[0])
    tmp = gamemove[0].copy()
    gamemove[0] = gamemove[1]
    gamemove[1] = tmp
    gamemove = features.extract(gamemove, result[0], result[1], col)
    return gamemove, resultmatrix
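The while loop legalizes the prediction by zeroing the best softmax entry and re-taking the argmax until a legal point is found. A compact NumPy alternative, assuming (as the code above does) that plane 4 holds 0 for illegal points:

import numpy as np

# Sketch: mask illegal points before a single argmax instead of
# zeroing entries one at a time.
legal = gamemove[4].astype(bool)
masked = np.where(legal, resultmatrix, -np.inf)
result = np.unravel_index(masked.argmax(), masked.shape)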
Example #34
def extract_features(formatted_corpus, feature_names, training=True, model=None):
    non_proj = []

    X_1 = []
    y_1 = []

    sent_cnt = 0
    for sentence in formatted_corpus:
        sent_cnt += 1
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []
        feats = []
        while queue:
            feats.append(features.extract(stack, queue, graph, feature_names, sentence))
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        X_1.extend(feats)
        y_1.extend(transitions)
        #print('Equal graphs:', transition.equal_graphs(sentence, graph))
        if not transition.equal_graphs(sentence, graph):
            non_proj.append(sentence)

        # Poor man's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
        #print(transitions)
        #print(graph)

    #print(len(non_proj))
    #s = sorted(non_proj, key=lambda x: len(x))

    #print([x['form'] for x in s[0]])

    #for x in non_proj:
    #    print(len(x))
    #    print(x)

    return (X_1, y_1)
Example #35
def prob_disambiguate_words(words):
    """Given a list of words/lemmas, return a list of disambiguation answers for
    them -- return a list of lists, where each sublist is ordered in decreasing
    probability."""
    classifiers = [classifier_for(word, nonnull=True) for word in words]
    answers = []
    for i in range(len(words)):
        faketagged = [(w,None) for w in words]
        feat = features.extract(faketagged, i)
        classif = classifiers[i]

        ## get all possible options, sorted in wrong order
        dist = classif.prob_classify(feat)
        options = [(dist.prob(samp), samp) for samp in dist.samples()]
        options.sort(reverse=True)
        myanswers = [str(lex) for (prob, lex) in options
                              if prob > 0.01 ]
        print(myanswers)
        answers.append(myanswers)
    return answers
Example #36
def test(file_name):

    #    file_name="combined_nomeal.csv"
    with open("Guassian_model.pkl", 'rb') as file:
        Guassian_model = pickle.load(file)
    with open("pca_model.pkl", 'rb') as file:
        pca = pickle.load(file)

    test_data = pd.read_csv(file_name, header=None)
    print("---")
    fm = extract(test_data)
    print("---")

    sc = StandardScaler()
    test_data_set = sc.fit_transform(fm)
    pca_dataset = pca.transform(test_data_set)
    gaussianNB_pred = Guassian_model.predict(pca_dataset)
    print("Classes of your given test  " + str(gaussianNB_pred))
    np.savetxt("output.csv", gaussianNB_pred, delimiter=",", fmt='%d')
    print("predicted class labels are saved in output.csv file")
Example #37
def perform_prediction(split_test_sentences, feature_names, classifier, model):
    print("Extracting features from test Corpus")
    X_matrix_test = []  # The matrix of feature vectors
    Y_vec_test = []  # The vector of predicted outputs
    # We iterate over all sentences that are in the corpus
    for sentence in split_test_sentences:
        stack = []  # Stack of tokens that will be manipulated with shift, reduce, left-arc, and right-arc
        queue = list(sentence)  # Queue of input tokens that will be cycled through
        graph = {}  # The dependency graph that we will go store our dependency arcs in
        graph['heads'] = {}  # Create a dictionary inside graph for heads of each sentence
        graph['heads']['0'] = '0'  # The first element in the heads dictionary inside the graph dictionary is '0'
        graph['deprels'] = {}  # Define another dictionary with dependency relations stored inside, again inside graph
        graph['deprels']['0'] = 'ROOT'  # Make the first element in the deprel dictionary the 'ROOT' keyword
        while queue:
            x_elem = features.extract(stack, queue, graph, feature_names, sentence)
            x_predictable = features.vec.transform(x_elem)
            # X_matrix_test.extend(x_elem)
            # stack, queue, graph, y_elem = dparser.reference(stack, queue, graph)
            # WE DO NOT USE Y_VEC_TEST
            # Y_vec_test.append(y_elem)
            predicted_transition = classifier.predict(x_predictable)
            stack, queue, graph, transition = parse_ml(stack, queue, graph, predicted_transition)
Example #38
def extract_features(csv_path, csv_file):
    data = []
    f_temp = []
    with open(csv_path, "r") as data_file: #open the files
        for row in data_file:
            strsplit = row.split(',')
            strsplit = list(map(float, strsplit))
            data.append(strsplit)

    no_of_data_per_window = int(sampling_rate * window_size)
    no_of_windows = len(data) // no_of_data_per_window * 2 - 1
    #count = 0
    for i in range(no_of_windows):
        window_slice_data = []
        for j in range(no_of_data_per_window):
            window_slice_data.append(data[i * no_of_data_per_window // 2 + j])
        #count += 1
        #print(count)
        window_slice_data = np.asarray(window_slice_data)[:, 0:3].tolist()
        f_temp.append(features.extract(window_slice_data))
    l_temp = np.full(len(f_temp), table[csv_file])
    return f_temp, l_temp
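With 50% overlap, each window starts half a window after the previous one, which is where the len(data) // no_of_data_per_window * 2 - 1 count comes from. A small self-contained check of the start indices (the sample numbers are hypothetical):

no_of_data_per_window = 100
n_samples = 350
no_of_windows = n_samples // no_of_data_per_window * 2 - 1   # 5 windows
starts = [i * no_of_data_per_window // 2 for i in range(no_of_windows)]
assert starts == [0, 50, 100, 150, 200]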
Example #39
def getData():

    if (os.path.exists('df_no_index_10mfcc.csv')):
        df = pd.read_csv('df_no_index_10mfcc.csv')
        df_y = df['genre']
        df_x = df.drop('genre', axis=1)
        return df_x, df_y

    # features haven't been created
    print("Create Dataset")
    signal, y = saveDataset.dataset()
    df_y = pd.Series(data=y)

    # construct features
    print("Feature Extraction")
    df_x = pd.DataFrame()
    for i in range(len(signal)):
        if (i % 50 == 0):
            print('audio {}'.format(i + 1))
        new_x = pd.DataFrame(features.extract(signal[i]), index=[i])
        df_x = df_x.append(new_x)

    saveDataset.saveFeature(df_x, pd.DataFrame(df_y, columns=['genre']))
    return df_x, df_y
Example #40
def main():
    parser = util_run_experiment.get_argparser()
    args = parser.parse_args()
    assert args.targetlang in all_target_languages
    assert args.sourceword in all_words

    targetlang = args.targetlang
    sourceword = args.sourceword
    trialdir = args.trialdir
    stanford.taggerhome = args.taggerhome

    print("Loading and tagging test problems...")
    problems = util_run_experiment.get_test_instances(trialdir, sourceword)
    print("OK loaded and tagged.")

    ## classifier = get_maxent_classifier(sourceword, targetlang)
    classifier = get_pickled_classifier(sourceword, targetlang, "level1")
    if not classifier:
        print("Couldn't load pickled L1 classifier?")
        return
    print("Loaded pickled L1 classifier!")


    bestoutfn = "../L1output/{0}.{1}.best".format(sourceword, targetlang)
    oofoutfn = "../L1output/{0}.{1}.oof".format(sourceword, targetlang)
    with open(bestoutfn, "w") as bestoutfile, \
         open(oofoutfn, "w") as oofoutfile:
        for problem in problems:
            featureset = features.extract(problem)
            answer = classifier.classify(featureset)
            dist = classifier.prob_classify(featureset)
            oof_answers = util_run_experiment.topfive(dist)
            print(output_one_best(problem, targetlang, answer),
                  file=bestoutfile)
            print(output_five_best(problem, targetlang, oof_answers),
                  file=oofoutfile)
Example #41
def generate(track):
    blob = extract(track.source, track.details_file)
    features.extract(blob, track)
    update_tags(blob, track)
Example #42
    # collect val and time sequence from addresses

    dirPath = 'test_addr'
    addrs = os.listdir(dirPath)
    p = connectPSQL(psql)
    ''' 
    for addr in addrs:
        if addr[0]!='d':
            color.pImportant('addr file: '+addr)
            full_path = os.path.join(dirPath,addr)
            data_file = 'test_'+addr.split('.')[0].split('_')[1]+'_database.csv'
            data_file = os.path.join('result',data_file)
            in_csv = collectTxnIn(p,full_path)
            out_csv = collectTxnOut(p,full_path)
            deal_sql.deal_feature(in_csv, out_csv, data_file)
            feature.extract(data_file)
           
            os.rename(full_path,os.path.join(dirPath,'done-'+addr))
    '''
    addr = 'add_ponzi_train.csv'
    color.pImportant('addr file: ' + addr)
    full_path = os.path.join(dirPath, addr)
    data_file = 'test_' + addr.split('.')[0].split('_')[1] + '_database.csv'
    data_file = os.path.join('result', data_file)
    in_csv = collectTxnIn(p, full_path)
    out_csv = collectTxnOut(p, full_path)
    deal_sql.deal_feature(in_csv, out_csv, data_file)
    feature.extract(data_file)
    p.close()
Example #43
#################################
meal.to_csv("combined_meal.csv", index=False, header=False)
nomeal.to_csv("combined_nomeal.csv", index=False, header=False)
#

###########################################
X = [29 - i for i in range(0, 30)]

#for i in range(10) :
#    plt.plot(X,meal.iloc[73,:],marker="X")
#

#print("AFTER FUNCTION")
###call function
feature_Matrix_Meal = pd.DataFrame()
feature_Matrix_Meal = extract(meal)

feature_Matrix_NoMeal = pd.DataFrame()
feature_Matrix_NoMeal = extract(nomeal)

#
#feature_Matrix_Meal['class']=1
#feature_Matrix_NoMeal['class']=0
#

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
train_meal = sc.fit_transform(feature_Matrix_Meal)
train_nomeal = sc.fit_transform(feature_Matrix_NoMeal)
Example #44
def ikObjectiveToFeatures(ikgoal,featureList):
    """Given an IKObjective instance and a list of features, returns a
    feature descriptor as a list of floats"""
    jsonObj = loader.toJson(ikgoal)
    return features.extract(jsonObj,featureList)
Example #45
import model
import performance
import dataset

if __name__ == "__main__":
    # We begin by downloading the data. The data will be in the form of
    # "events" data: each datapoint for each patient will be a recorded event.
    X, Y = download.download()

    # The event data is reformatted. This is done by selecting the given
    # variables and  transforming time-dependent events to a path.
    X = reformat.reformat(X,
                          static_variables=["Age", "Gender"],
                          dynamic_variables=["Creatinine", "Glucose"])

    # Now, we normalise the data.
    X = normalise.normalise(X)

    # We extract features from the input data.
    features = features.extract(X)

    # The dataset is now split into a training and testing set.
    features_train, Y_train, features_test, Y_test = dataset.split(
        features, Y, proportion=0.75)

    # We now train the model with the selected features.
    classifier = model.train(features_train, Y_train)

    # We evaluate performance of the model now.
    performance.evaluate(classifier, features_test, Y_test)
Example #46
    p = connectPSQL(psql)
    times = [time.time()]

    #for addr in addrs:
    for i in range(1):
        addr = 'new1.csv'
        color.pImportant('addr file: ' + addr)
        full_path = os.path.join(dirPath, addr)

        in_csv = collectTxnIn(p, full_path)
        out_csv = collectTxnOut(p, full_path)
        times.append(time.time())
        color.pImportant('collected all txns in ' + str(times[-1] - times[-2]))

        data_file = addr.split('.')[0] + '_database.csv'
        data_file = os.path.join('result', data_file)
        deal_sql.deal_feature(in_csv, out_csv, data_file)
        feature_file = feature.extract(data_file)
        times.append(time.time())
        color.pImportant('processed all data in ' + str(times[-1] - times[-2]))
        color.pImportant('')
        '''
        feature_df = pd.read_csv(feature_file)
        label_df = pd.read_csv(os.path.join(dirPath,label),header=None)
        label_df.columns=['ponzi']
        feature_df['ponzi'] = label_df['ponzi']
        feature_df.to_csv(feature_file,index=None)
        '''
    p.close()
    color.pImportant('total time used: ' + str(times[-1] - times[0]))
Example #47
def ikObjectiveToFeatures(ikgoal, featureList):
    """Given an IKObjective instance and a list of features, returns a
    feature descriptor as a list of floats"""
    jsonObj = loader.toJson(ikgoal)
    return features.extract(jsonObj, featureList)
Example #48
    for sentence in formatted_corpus:
        sent_cnt += 1
        if sent_cnt % 1000 == 0:
            a = 1
            # print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'

        while queue:
            x = features.extract(stack, queue, graph, FEATURE_NAMES, sentence)

            X_test = vec.transform(x)

            predicted_trans_index = model.predict(X_test)[0]
            predicted_trans = dict_classes[predicted_trans_index]

            # Build new graph
            stack, queue, graph, trans = execute_transition(
                stack, queue, graph, predicted_trans)

            # Save the predicted trans
            y_predicted_symbols.append(trans)

        stack, graph = transition.empty_stack(stack, graph)
Example #49
    def testLetterBigrams(self):
      text = ""
      results = features.extract(text,[Features.LETTERBIGRAMS],False)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(0, sum(result))
      results = features.extract(text,[Features.LETTERBIGRAMS],True)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(0, sum(result))

      text = "eee"
      results = features.extract(text,[Features.LETTERBIGRAMS],False)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(2, result['ee'])
      self.assertEqual(2, sum(result.values()))
      results = features.extract(text,[Features.LETTERBIGRAMS],True)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(1.0, result['ee'])
      self.assertEqual(1.0, sum(result.values()))

      text = "eee eee e ee"
      results = features.extract(text,[Features.LETTERBIGRAMS],False)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(5, result['ee'])
      self.assertEqual(5, sum(result.values()))
      results = features.extract(text,[Features.LETTERBIGRAMS],True)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(1.0, result['ee'])
      self.assertEqual(1.0, sum(result.values()))

      text = "eekke eee e eze zzkzk"
      results = features.extract(text,[Features.LETTERBIGRAMS],False)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(3, result['ee'])
      self.assertEqual(1, result['ek'])
      self.assertEqual(1, result['kk'])
      self.assertEqual(1, result['ke'])
      self.assertEqual(1, result['ez'])
      self.assertEqual(1, result['ze'])
      self.assertEqual(2, result['zk'])
      self.assertEqual(1, result['kz'])
      self.assertEqual(1, result['zz'])
      self.assertEqual(11+1, sum(result.values()))
      results = features.extract(text,[Features.LETTERBIGRAMS],True)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(3/12.0, result['ee'])
      self.assertEqual(1/12.0, result['ek'])
      self.assertEqual(1/12.0, result['kk'])
      self.assertEqual(1/12.0, result['ke'])
      self.assertEqual(1/12.0, result['ez'])
      self.assertEqual(1/12.0, result['ze'])
      self.assertEqual(2/12.0, result['zk'])
      self.assertEqual(1/12.0, result['kz'])
      self.assertEqual(1/12.0, result['zz'])
      self.assertEqual(1.0, sum(result.values()))

      text = "ababababa"
      results = features.extract(text,[Features.LETTERBIGRAMS],False)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(4, result['ab'])
      self.assertEqual(4, result['ba'])
      self.assertEqual(8, sum(result.values()))
      results = features.extract(text,[Features.LETTERBIGRAMS],True)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(4/8.0, result['ab'])
      self.assertEqual(4/8.0, result['ba'])
      self.assertEqual(1.0, sum(result.values()))
      
      text = "ab \nn \tt"
      results = features.extract(text, [Features.LETTERBIGRAMS], False)
      result = results[Features.LETTERBIGRAMS]
      self.assertEqual(1.0, result['ab'])
      self.assertEqual(1.0, sum(result.values()))
Example #50
    def testSaveAndLoad(self):
      text = "ekin and ekin."
      results = features.extract(text, 
      [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS], False)

      features.save(results, "test1")
      returned = features.load("test1")
      os.remove("test1")

      self.assertEquals(results, returned)

      self.assertEqual(2, returned[Features.LETTERUNIGRAM][ord('e')-ord('a')])

      results = features.extract(text, 
      [Features.POSBIGRAMS, Features.LETTERBIGRAMS], True)
      features.save(results, "test2")
      returned = features.load("test2")
      os.remove("test2")

      self.assertEquals(results, returned)
      results = features.extract(text, 
      [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS, Features.POSUNIGRAM, Features.POSBIGRAMS], True)
      features.save(results, "test3")
      returned = features.load("test3")
      os.remove("test3")

      self.assertEquals(results, returned)
      
      text = ""
      results = features.extract(text, 
      [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS, Features.POSUNIGRAM, Features.POSBIGRAMS], False)
      
      features.save(results, "test4")
      returned = features.load("test4")
      os.remove("test4")

      self.assertEquals(results, returned)
      
      
      text = "\ttrying to test really long text\n\n \a and see if it still works or not \n 101 531 z3z3\n\t"
      text += text
      text += text
      results = features.extract(text, 
      [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS, Features.POSUNIGRAM, Features.POSBIGRAMS], False)
      
      features.save(results, "test5")
      returned = features.load("test5")
      os.remove("test5")

      self.assertEquals(results, returned)
      
      text = ['test1', '222aaa', 'threee333', 't\test1']
      results['authors'] = [0, 0, 1, 1]
      results['text'] = list()
      for t in text:
        results['text'].append(features.extract(t, [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS, Features.POSUNIGRAM, Features.POSBIGRAMS], False))
      features.save(results, "test6")
      returned = features.load("test6")
      os.remove("test6")
          
      self.assertEqual(results['authors'], returned['authors'])
      
      for idx in range(0,len(returned['text'])):
          feat = features.extract(text[idx], [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS, Features.POSUNIGRAM, Features.POSBIGRAMS], False)
          self.assertEqual(feat, returned['text'][idx])
Example #51
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []

        #feature_names = ["stack_pos", "stack_word", "queue_pos", "queue_word", "can_ra", "can_la"]

        while queue:
            # print(queue)

            x = features.extract(stack, queue, graph, feature_names, sentence)
            X.append(
                dict(zip(feature_names, x))
            )  # Currently this produces one long list where each element is a sentence; change it so each word is its own element.
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)
            y = trans

            y_vector.append(y)

        stack, graph = transition.empty_stack(stack, graph)
        #  print('Equal graphs:', transition.equal_graphs(sentence, graph))

        # Poor man's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
Example #52
    X = []
    y = []
    for sentence in formatted_corpus:
        sent_cnt += 1
        #if sent_cnt % 1000 == 0:
            #print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []
        while queue:
            X.append(features.extract(stack, queue, graph, feature_names, sentence))
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)
            y.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        #print('Equal graphs:', transition.equal_graphs(sentence, graph))

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
        #print(transitions)
        #print(graph)

    vec = DictVectorizer(sparse=True)

    modelFilename = model_name + '.sav'
Example #53
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []

        x_templist = []
        y_templist = []

        while queue:
            current_dictX, current_Y = features.extract(
                stack, queue, graph, feature_1, sentence, 1)
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)

            x_templist.append(current_dictX)
            y_templist.append(current_Y)

        stack, graph = transition.empty_stack(stack, graph)

        for word in sentence:
            word['head'] = graph['heads'][word['id']]

        x_list.extend(x_templist)
        y_list.extend(y_templist)

    print("Encoding the features and classes...")
Example #54
def build_instance(tagged_sentence, annotated, index):
    feat = features.extract(tagged_sentence, annotated, index)
    label = tagged_sentence[index][1]
    return (feat, label)
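Building a full training set is then one call to build_instance per token position; in the sketch below, corpus is a hypothetical iterable of (tagged_sentence, annotated) pairs:

# Hypothetical corpus loop around build_instance() above.
training_data = [build_instance(tagged_sentence, annotated, i)
                 for tagged_sentence, annotated in corpus
                 for i in range(len(tagged_sentence))]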
Example #55
def process(
    file
):  # used by multiprocessor. Tensors are stored multiple times to reduce RAM usage.
    global gamecounter, kgsdatain, kgsdataout, kgsdatawin

    print file
    gamefile = open(file, 'r')
    gamecounter += 1
    if gamecounter == SAVEFILE:
        lock.acquire()
        newin = np.load(
            '/media/falk/6,0 TB Volume/19 badukmovies/kgsdatain_alllayer.npy')
        newout = np.load(
            '/media/falk/6,0 TB Volume/19 badukmovies/kgsdataout_alllayer.npy')
        newwin = np.load(
            '/media/falk/6,0 TB Volume/19 badukmovies/kgsdatawin_alllayer.npy')
        newin = np.concatenate([newin, kgsdatain]) if newin.size else kgsdatain
        newout = np.concatenate([newout, kgsdataout
                                 ]) if newout.size else kgsdataout
        newwin = np.concatenate([newwin, kgsdatawin
                                 ]) if newwin.size else kgsdatawin
        kgsdatain = np.array([])
        kgsdataout = np.array([])
        kgsdatawin = np.array([])
        print("Game block saved")
        if len(newin) >= SAVEBIGFILE:
            metacounter = 0
            while os.path.isfile(
                    '/media/falk/6,0 TB Volume/19 badukmovies/pro/kgsdatain_alllayer_'
                    + str(metacounter) + '.npy'):
                metacounter += 1
            newinstr = '/media/falk/6,0 TB Volume/19 badukmovies/pro/kgsdatain_alllayer_' + str(
                metacounter) + '.npy'
            newoutstr = '/media/falk/6,0 TB Volume/19 badukmovies/pro/kgsdataout_alllayer_' + str(
                metacounter) + '.npy'
            newwinstr = '/media/falk/6,0 TB Volume/19 badukmovies/pro/kgsdatawin_alllayer_' + str(
                metacounter) + '.npy'
            # this is the final storage location
            np.save(newinstr, newin)
            np.save(newoutstr, newout)
            np.save(newwinstr, newwin)
            newout = np.array([])
            newin = np.array([])
            newwin = np.array([])
            print(
                "--------------- META game block saved --------------------"
            )
        np.save(
            '/media/falk/6,0 TB Volume/19 badukmovies/kgsdatain_alllayer.npy',
            newin)
        np.save(
            '/media/falk/6,0 TB Volume/19 badukmovies/kgsdataout_alllayer.npy',
            newout)
        np.save(
            '/media/falk/6,0 TB Volume/19 badukmovies/kgsdatawin_alllayer.npy',
            newwin)
        lock.release()
        gamecounter = 0

    gamefile = gamefile.read()
    gamefile = gamefile.replace("\n", "")
    gamefile = gamefile.replace("(;", "")
    game = gamefile
    boardpos = initboardpos.copy()

    if "HA[" in gamefile:  # if there is a handicap
        handycap = int(gamefile.split("HA[", 1)[1][0])  # record the handicap
        if handycap >= 2:
            (boardpos[0, 15, 3], boardpos[0, 3, 15]) = (1, 1)
            (boardpos[2, 15, 3], boardpos[2, 3, 15]) = (0, 0)
        if handycap >= 3:
            boardpos[0, 15, 15] = 1
            boardpos[2, 15, 15] = 0
        if handycap >= 4:
            boardpos[0, 3, 3] = 1
            boardpos[2, 3, 3] = 0
        if handycap == 5:
            boardpos[0, 9, 9] = 1
            boardpos[2, 9, 9] = 0
        if handycap >= 6:
            (boardpos[0, 9, 3], boardpos[0, 9, 15]) = (1, 1)
            (boardpos[2, 9, 3], boardpos[2, 9, 15]) = (0, 0)
        if handycap == 7:
            boardpos[0, 9, 9] = 1
            boardpos[2, 9, 9] = 0
        if handycap >= 8:
            (boardpos[0, 3, 9], boardpos[0, 15, 9]) = (1, 1)
            (boardpos[2, 3, 9], boardpos[2, 15, 9]) = (0, 0)
        if handycap == 9:
            boardpos[0, 9, 9] = 1
            boardpos[2, 9, 9] = 0


# main sgf disassembly begins here

    for sig, col, y, x in zip(game, game[1:], game[3:],
                              game[4:]):  # structure: ;B[ic]

        ######################## store prev board, feedback ##########################
        if sig == ";":
            if x not in alphabet: break  # error or pass: leave game
            if y not in alphabet: break  # error or pass: leave game
            if col not in ("B", "W"): break  # error: leave game
            xpos = alphabet.index(x)
            ypos = alphabet.index(y)
            storeboardpos = boardpos[None, :]
            kgsdatain = np.concatenate([kgsdatain, storeboardpos
                                        ]) if kgsdatain.size else storeboardpos
            # store to database (uses boardpos from last iteration)
            boardpos2 = boardpos[0:2].copy(
            )  # swap board for agent to always have color of plane 1
            boardpos2[1] = boardpos[0]  # 1st agent is b, 2nd w, 3rd b etc.
            boardpos[0] = boardpos[1]
            boardpos[1] = boardpos2[1]

            onemove = np.zeros([1, BOARD, BOARD],
                               dtype=int)  # add one in feedback board
            onemove[0, xpos, ypos] = 1
            onewin = np.array([1])

            if "RE[B" in game:  # b wins = 1
                onewin[0] = 1
            else:
                onewin[0] = 0

            kgsdatawin = np.concatenate(
                [kgsdatawin, onewin]) if kgsdatawin.size else onewin  # we lose
            kgsdataout = np.concatenate([kgsdataout, onemove
                                         ]) if kgsdataout.size else onemove
            ############################# add new move #################################

            boardpos = features.extract(boardpos, xpos, ypos,
                                        col)  # modifies all layers in boardpos
Example #56
    def testLetterTrigrams(self):
      text = ""
      results = features.extract(text,[Features.LETTERTRIGRAMS],False)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(0, sum(result.values()))
      results = features.extract(text,[Features.LETTERTRIGRAMS],True)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(0, sum(result.values()))

      text = "eee"
      results = features.extract(text,[Features.LETTERTRIGRAMS],False)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(1, result['eee'])
      self.assertEqual(1, sum(result.values()))
      results = features.extract(text,[Features.LETTERTRIGRAMS],True)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(1.0, result['eee'])
      self.assertEqual(1.0, sum(result.values()))

      text = "eee eee e ee"
      results = features.extract(text,[Features.LETTERTRIGRAMS],False)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(2, result['eee'])
      self.assertEqual(2, sum(result.values()))
      results = features.extract(text,[Features.LETTERTRIGRAMS],True)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(1.0, result['eee'])
      self.assertEqual(1.0, sum(result.values()))

      text = "eekke eee e ezee zzkzkz"
      results = features.extract(text,[Features.LETTERTRIGRAMS],False)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(1, result['eek'])
      self.assertEqual(1, result['ekk'])
      self.assertEqual(1, result['kke'])
      self.assertEqual(1, result['eee'])
      self.assertEqual(1, result['eze'])
      self.assertEqual(1, result['zee'])
      self.assertEqual(1, result['zzk'])
      self.assertEqual(2, result['zkz'])
      self.assertEqual(1, result['kzk'])
      self.assertEqual(10, sum(result.values()))
      results = features.extract(text,[Features.LETTERTRIGRAMS],True)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(1/10.0, result['eek'])
      self.assertEqual(1/10.0, result['ekk'])
      self.assertEqual(1/10.0, result['kke'])
      self.assertEqual(1/10.0, result['eee'])
      self.assertEqual(1/10.0, result['eze'])
      self.assertEqual(1/10.0, result['zee'])
      self.assertEqual(1/10.0, result['zzk'])
      self.assertEqual(2/10.0, result['zkz'])
      self.assertEqual(1/10.0, result['kzk'])
      self.assertEqual(10/10.0, round(sum(result.values())))

      text = "ababababa"
      results = features.extract(text,[Features.LETTERTRIGRAMS],False)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(4, result['aba'])
      self.assertEqual(3, result['bab'])
      self.assertEqual(7, sum(result.values()))
      results = features.extract(text,[Features.LETTERTRIGRAMS],True)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(4/7.0, result['aba'])
      self.assertEqual(3/7.0, result['bab'])
      self.assertEqual(1.0, sum(result.values()))
      
      text = "ab \nn \ttzx"
      results = features.extract(text, [Features.LETTERTRIGRAMS], False)
      result = results[Features.LETTERTRIGRAMS]
      self.assertEqual(1.0, result['tzx'])
      self.assertEqual(1.0, sum(result.values()))