Example #1
# NOTE: the imports and function header below are assumed, following the
# pattern of the other examples; the exact module providing print_tree may
# differ in the installed graphbrain version
from graphbrain import hedge
from graphbrain.cli import wrapper
from graphbrain.parsers import create_parser, print_tree


def test_parser(args):
    parser = create_parser(name=args.lang)

    total = 0
    wrong = 0

    sentence = None
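    # the test file alternates lines: a sentence, then its expected hyperedge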
    with open(args.infile) as f:
        for line in f:
            if sentence:
                total += 1
                correct_edge = hedge(line.strip())
                parser_output = parser.parse(sentence)
                parsed_sentence = parser_output['parses'][0]
                edge = parsed_sentence['main_edge']
                sent = parsed_sentence['spacy_sentence']
                if edge != correct_edge:
                    wrong += 1
                    print_tree(sent.root)
                    print('expected:')
                    print(correct_edge)
                    print('result:')
                    print(edge)
                sentence = None
            else:
                sentence = line.strip()

    print('%s wrong out of %s.' % (wrong, total))


if __name__ == '__main__':
    wrapper(test_parser, text='parser tests')
Example #2
import random
from graphbrain.cli import wrapper
from graphbrain.parsers import create_parser


def extract_sentences(args):
    parser = create_parser(name=args.lang)
    sentences = []

    count = 0
    # parse each non-empty paragraph and collect the individual sentence texts
    with open(args.infile, 'r') as infile:
        for line in infile:
            paragraph = line.strip()
            if len(paragraph) > 0:
                parse_results = parser.parse(paragraph)
                for parse in parse_results['parses']:
                    sentences.append(parse['text'])
                    count += 1
                    if count % 100 == 0:
                        print('{} sentences found'.format(count))

    random.shuffle(sentences)

    with open(args.outfile, 'w') as outfile:
        for sentence in sentences:
            outfile.write('{}\n'.format(sentence))


if __name__ == '__main__':
    wrapper(extract_sentences, text='extract sentences')
Example #3
                    remaining += 1
                    source_name = source[:-1]
                    sources[source_name] += 1
                    sentences_by_source[source_name].append(sentence)
            else:
                empty += 1
                if empty > len(self.input_files):
                    break

        self._close_input_files()

        # write files per source
        for source in sentences_by_source:
            sentences = sentences_by_source[source]
            random.shuffle(sentences)
            with open(join(outdir, '{}.csv'.format(source)), 'w') as f:
                for sentence in sentences:
                    f.write('{}\n'.format(sentence))

        print('existing: {}; remaining: {}'.format(len(self.sentences),
                                                   remaining))
        print(sources)


def extract_remaining_sentences(args):
    SentenceExtractor().extract(args.indir, args.infile, args.outdir)


if __name__ == '__main__':
    wrapper(extract_remaining_sentences, text='extract remaining sentences')
Example #4
                f = self.input_files[name]
                yield f.readline(), name

    def generate(self, indir, outfile):
        self._open_input_files(indir)

        self.load_sentences(outfile)

        with open(outfile, 'a') as f_out:
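            # skip sentences that are already annotated; keep prompting until
            # annotate_sentence returns a valid case, then persist it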
            for sentence, source in self._sentences():
                sentence = sentence.strip()
                if sentence not in self.sentences:
                    self.print_counts()
                    case = None
                    while case is None:
                        case = self.annotate_sentence(sentence, source)
                    f_out.write('{}\n'.format(json.dumps(case)))
                    self.sentences.add(sentence)
                    self.update_counts(case)

        self._close_input_files()


def generate_parser_training_data(args):
    TrainingDataGenerator(args.lang).generate(args.indir, args.outfile)


if __name__ == '__main__':
    wrapper(generate_parser_training_data,
            text='generate parser training data')
Example #5
            pos_after = row[19]
            source = row[25][:-1]
            X.append((tag, dep, hpos, hdep, pos_after))
            y.append(true_value)
            sources.append(source)
    preds = alpha.predict(X)

    # overall and per-source tallies
    n = 0
    correct = 0
    n_source = Counter()
    correct_source = Counter()

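    # compare predictions with the gold labels, tallying per-source counts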
    for pred, true_value, source in zip(preds, y, sources):
        n += 1
        n_source[source] += 1
        if pred == true_value:
            correct += 1
            correct_source[source] += 1

    for source in n_source:
        accuracy = float(correct_source[source]) / float(n_source[source])
        print('{} accuracy: {} [{} correct out of {}]'.format(
            source, accuracy, correct_source[source], n_source[source]))

    print()
    accuracy = float(correct) / float(n)
    print('overall accuracy: {} [{} correct out of {}]'.format(
        accuracy, correct, n))


if __name__ == '__main__':
    wrapper(test_alpha, text='test alpha classifier')
Example #6
    def run(self):
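        # two-stage greedy search: the ablation stage repeatedly calls
        # _ablate and the regrowth stage repeatedly calls _regrow, each
        # looping until the feature set stops changing between iterations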
        new_features = ALL_FEATURES
        cur_features = None
        i = 1

        # ablation stage
        while new_features != cur_features:
            self._log('\n>>> ITERATION {} <<<'.format(i))
            i += 1
            cur_features = new_features
            new_features = self._ablate(cur_features)

        # regrowth stage
        cur_features = None
        while new_features != cur_features:
            self._log('\n>>> ITERATION {} <<<'.format(i))
            i += 1
            cur_features = new_features
            new_features = self._regrow(cur_features)


def select_alpha_features(args):
    infile = args.infile
    outfile = args.outfile
    FeatureSelector(infile, outfile).run()


if __name__ == '__main__':
    wrapper(select_alpha_features, text='select features for alpha classifier')
Example #7
            delta_t = time.time() - start_t
            self.time_acc += delta_t
            items_per_min = float(self.items_processed) / float(self.time_acc)
            items_per_min *= 60.
            # print('total items: %s' % self.items_processed)
            # print('items per minute: %s' % items_per_min)
        self.items_processed += 1

    def parse_file(self, filename):
        lines = file_lines(filename)
        i = 0
        with progressbar.ProgressBar(max_value=lines) as bar:
            with open(filename, 'r') as f:
                for line in f:
                    post = json.loads(line)
                    self.parse_post(post)
                    i += 1
                    bar.update(i)

        print('main edges created: %s' % self.main_edges)
        print('extra edges created: %s' % self.extra_edges)


def _parse(args):
    hgraph = hypergraph(args.hg)
    RedditParser(hgraph).parse_file(args.infile)


if __name__ == '__main__':
    wrapper(_parse, text='reddit parser')
Example #8
                        word_after = str(spacy_sentence[i + 1])
                        pos_after = spacy_sentence[i + 1].pos_
                        dep_after = spacy_sentence[i + 1].dep_
                        if pos_after == 'PUNCT':
                            punct_after = True

                    head = token.head
                    # a spaCy root token's head is the token itself, never None
                    is_root = head is token
                    has_lefts = token.n_lefts > 0
                    has_rights = token.n_rights > 0
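                    # write one tab-separated row: the atom's type code as the
                    # target label, followed by 23 token, context and case fields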
                    outfile.write(('{}' + '\t{}' * 23 + '\n').format(
                        hedge(atom).type()[0], str(token), token.pos_,
                        token.tag_, token.dep_,
                        str(head) if head else '', head.pos_ if head else '',
                        head.tag_ if head else '', head.dep_ if head else '',
                        is_root, has_lefts, has_rights, token.ent_type_,
                        token.shape_[:2], word_before, word_after,
                        punct_before, punct_after, pos_before, pos_after,
                        dep_before, dep_after, case['correct'],
                        case['source']))
            else:
                failed_parses += 1
            print('sentences: {}; ignored: {}; failed: {}; atoms: {}'.format(
                total_sentences, ignored_sentences, failed_parses,
                total_atoms))
        print('done.')


if __name__ == '__main__':
    wrapper(generate_alpha_training_data, text='generate alpha training data')
Example #9
            print(colored(str(he), 'white'))

            sentence = line.strip()

            if sentence not in sentences:
                sentences.append(sentence)
                parser_output = parser.parse(sentence)
                parsed_sentence = parser_output['parses'][0]
                edge = parsed_sentence['main_edge']

                if edge:
                    print('\n{}\n{}\n'.format(sentence, indented(edge)))

                    answer = he.input()
                    if answer == 'd':
                        defects = input_defects(sentence, edge)
                    else:
                        defects = []
                    he.apply_evaluation(answer, edge, defects)

                    defect_str = '&'.join(
                        [defect.to_str() for defect in defects])
                    row_str = '\t'.join(
                        (sentence, edge.to_str(), answer, defect_str))
                    with open(args.outfile, 'a') as of:
                        of.write('{}\n'.format(row_str))


if __name__ == '__main__':
    wrapper(manual_test, text='manual test of parser')
Example #10
                cases.append(case)

    random.shuffle(cases)

    n_cases = len(cases)
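    # hold out one third of the shuffled cases for testing; train on the rest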
    n_test = int(n_cases / 3)

    train_cases = cases[n_test:]
    test_cases = cases[:n_test]

    file_main_name = infile[:-5]
    train_file_name = '{}-train.json'.format(file_main_name)
    test_file_name = '{}-test.json'.format(file_main_name)

    with open(train_file_name, 'wt') as f:
        for case in train_cases:
            f.write('{}\n'.format(json.dumps(case)))

    with open(test_file_name, 'wt') as f:
        for case in test_cases:
            f.write('{}\n'.format(json.dumps(case)))

    print('{} total cases found'.format(n_cases))
    print('wrote {} train cases to {}'.format(len(train_cases),
                                              train_file_name))
    print('wrote {} test cases to {}'.format(len(test_cases), test_file_name))


if __name__ == '__main__':
    wrapper(split_parser_training_data, text='split parser training data')
Example #11
from graphbrain.cli import wrapper
from graphbrain.parsers import create_parser


def update_tests(args):
    parser = create_parser(name=args.lang, resolve_corefs=False)

    total = 0

    sentence = None
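    # the input test file alternates sentence and hyperedge lines; the expected
    # hyperedge is regenerated from a fresh parse of each sentence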
    with open(args.infile) as f_in:
        with open(args.outfile, 'w') as f_out:
            for line in f_in:
                if sentence:
                    total += 1
                    parser_output = parser.parse(sentence)
                    parsed_sentence = parser_output['parses'][0]
                    edge = parsed_sentence['main_edge']
                    f_out.write('{}\n{}\n'.format(sentence, edge.to_str()))
                    sentence = None
                else:
                    sentence = line.strip()

    print('Total cases processed: {}.'.format(total))


if __name__ == '__main__':
    wrapper(update_tests, text='update tests')