Example #1
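# Renders each text corpus under <filedir>/text/<name> into image data: for the
# training and test splits it creates the matching <filedir>/image/<name>/<split>/
# directory and calls mp_generate with 8 workers, skipping any split whose image
# directory already looks complete (two files per text line).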
def fromText2Image(filedir='dataset'):
    assert os.path.exists(
        filedir), 'directory %s does not exist.' % filedir
    textfiles = {
        'acceleration': '%s/text/acceleration' % filedir,
        # 'underfitting': '%s/text/underfitting' % filedir
    }
    for key, value in textfiles.items():
        # training
        aim_path = '%s/training/' % (value.replace('text', 'image'))
        utils.mkdir(aim_path)
        trainingText = utils.read_text(filename='%s/training.txt' %
                                       (value)).split('\n')
        if not len(os.listdir(aim_path)) == 2 * len(trainingText):
            print 'generating images in %s' % aim_path
            mp_generate(textlines=trainingText,
                        aim_path=aim_path,
                        threads_num=8)
            del trainingText
        else:
            pass

        # test
        aim_path = '%s/test/' % (value.replace('text', 'image'))
        utils.mkdir(aim_path)
        testText = utils.read_text(filename='%s/test.txt' %
                                   (value)).split('\n')
        if not len(os.listdir(aim_path)) == 2 * len(testText):
            print 'generating images in %s' % aim_path
            mp_generate(textlines=testText, aim_path=aim_path, threads_num=8)
            del testText
        else:
            pass
Example #2
File: test.py Project: snnclsr/nmt
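# Loads a saved Seq2Seq model, decodes the source-side validation sentences with
# beam search (beam size 3), and reports the corpus-level BLEU score of the top
# hypotheses against the target references.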
def main():

    arg_parser = argparse.ArgumentParser(
        description="Neural Machine Translation Testing")
    arg_parser.add_argument("--model_file", required=True, help="Model File")
    arg_parser.add_argument("--valid_data",
                            required=True,
                            nargs="+",
                            help="Validation_data")

    args = arg_parser.parse_args()
    args = vars(args)
    print(args)
    model = Seq2Seq.load(args["model_file"])
    print(model)
    model.device = "cpu"

    tr_dev_dataset_fn, en_dev_dataset_fn = args["valid_data"]
    tr_valid_data = read_text(tr_dev_dataset_fn)
    en_valid_data = read_text(en_dev_dataset_fn)

    valid_data = list(zip(tr_valid_data, en_valid_data))

    src_valid, tgt_valid = add_start_end_tokens(valid_data)

    hypotheses = beam_search(model,
                             src_valid,
                             beam_size=3,
                             max_decoding_time_step=70)
    top_hypotheses = [hyps[0] for hyps in hypotheses]
    bleu_score = compute_corpus_level_bleu_score(tgt_valid, top_hypotheses)
    print('Corpus BLEU: {}'.format(bleu_score * 100))
Example #3
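# Loads the vocabulary, the train/test splits, the GloVe embedding matrix and the
# precomputed word-distance matrix for FLAGS.data, zeroes the distance entries of
# stop words, and then launches training.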
def main(argv=None):

    org_dic, _ = load_dictionary(FLAGS.data, MAX_VOCAB_SIZE, FLAGS.data_dir)
    train_texts, train_labels = read_text("%s/train" % FLAGS.data,
                                          FLAGS.data_dir)
    test_texts, test_labels = read_text("%s/test" % FLAGS.data, FLAGS.data_dir)

    train_seqs, train_seqs_mask = text_encoder(train_texts, org_dic,
                                               config.word_max_len[FLAGS.data])
    test_seqs, test_seqs_mask = text_encoder(test_texts, org_dic,
                                             config.word_max_len[FLAGS.data])
    print("Dataset ", FLAGS.data, " loaded!")
    glove_embedding_matrix = np.load(FLAGS.data_dir +
                                     "aux_files/embeddings_glove_%s_%d.npy" %
                                     (FLAGS.data, MAX_VOCAB_SIZE))
    dist_mat = np.load(FLAGS.data_dir +
                       "aux_files/small_dist_counter_%s_%d.npy" %
                       (FLAGS.data, MAX_VOCAB_SIZE))
    for stop_word in config.stop_words:
        if stop_word in org_dic:
            dist_mat[org_dic[stop_word], :, :] = 0

    train_log = train(
        train_seqs,
        train_seqs_mask,
        train_labels,
        test_seqs,
        test_seqs_mask,
        test_labels,
        glove_embedding_matrix,
        dist_mat,
        config.num_classes[FLAGS.data],
    )
Example #4
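    # Split-aware dataset loader: reads the image-name list for the requested
    # split and loads (or builds and saves) the point annotations in
    # pointDict.json.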
    def __init__(self, split=None, transform_function=None):

        self.path = "path-of-trainging-dataset"
        self.transform_function = transform_function

        fname_path = self.path
        path_pointJSON = "%s/pointDict.json" % self.path

        if split == "train":
            self.imgNames = [
                t.replace(".jpg\n", "")
                for t in ut.read_text(fname_path + "/train.txt")
            ]

        elif split == "val":
            self.imgNames = [
                t.replace(".jpg \n", "")
                for t in ut.read_text(fname_path + "/train_val.txt")
            ]
        elif split == "test":
            self.imgNames = [
                t.replace("\n", "")
                for t in ut.read_text(fname_path + "/test.txt")
            ]

        if os.path.exists(path_pointJSON):
            self.pointsJSON = ut.load_json(path_pointJSON)

        else:
            pointDict = get_pointDict(self.path, self.imgNames)
            ut.save_json(path_pointJSON, pointDict)

        self.split = split
        self.n_classes = 2
Example #5
    def __init__(self, split=None, transform_function=None):

        self.path = path = "//mnt/datasets/public/issam/VOCdevkit/VOC2007/"
        self.transform_function = transform_function

        fname_path = "%s/ImageSets/Main" % path
        path_pointJSON = "%s/pointDict.json" % path

        if split == "train":
            self.imgNames = [
                t.replace("\n", "")
                for t in ut.read_text(fname_path + "/train.txt")
            ]

        elif split == "val":
            self.imgNames = [
                t.replace("\n", "")
                for t in ut.read_text(fname_path + "/val.txt")
            ]
        elif split == "test":
            self.imgNames = [
                t.replace("\n", "")
                for t in ut.read_text(fname_path + "/test.txt")
            ]

        if os.path.exists(path_pointJSON):
            self.pointsJSON = ut.load_json(path_pointJSON)
        else:
            pointDict = get_pointDict(path, self.imgNames)
            ut.save_json(path_pointJSON, pointDict)

        # for j, key in enumerate(pointDict):
        #   print(j)
        #   pList1 = pointDict[key]
        #   pList2 = self.pointsJSON[key]

        #   for p1 in pList1:
        #     y, x = p1["y"], p1["x"]
        #     flag = False
        #     for p2 in pList2:
        #       y2, x2 = p2["y"], p2["x"]
        #       if y == y2 and x == x2:
        #         flag = True
        #         break
        #     assert flag == True

        self.split = split
        self.n_classes = 21
Example #6
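# Cross-checks every mention span against the raw document text and logs each
# case where the recorded name differs from the text at that offset.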
def __validate_mentions(doc_list_file, mentions, dst_miss_match_file):
    print 'checking mismatches'
    doc_mentions = Mention.arrange_mentions_by_docid(mentions)
    doc_paths = load_doc_paths(doc_list_file)
    doc_head = '<?xml version="1.0" encoding="utf-8"?>\n'
    miss_match_cnt = 0
    fout = open(dst_miss_match_file, 'wb')
    for doc_path in doc_paths:
        docid = doc_id_from_path(doc_path)
        cur_doc_mentions = doc_mentions.get(docid, list())
        if not cur_doc_mentions:
            continue

        doc_text = read_text(doc_path, True)
        if doc_text.startswith(doc_head):
            doc_text = doc_text[len(doc_head):]

        for m in cur_doc_mentions:
            name_in_doc = doc_text[m.beg_pos:m.end_pos + 1]
            if name_in_doc != m.name:
                miss_match_cnt += 1
                fout.write('%s\t%s\t%d\t%d\t%s\n' % (docid, m.name.encode('utf-8'), m.beg_pos, m.end_pos,
                                                     name_in_doc.encode('utf-8')))
                # print '%s\t%s\t%d\t%d\t%s' % (docid, m.name, m.beg_pos, m.end_pos, name_in_doc)
    fout.close()
    print miss_match_cnt, 'mismatches'
Example #7
def write_cited_contexts(doc_ids, index_folder, files_folder):

	contexts = defaultdict(list)
	for doc_id in progress(doc_ids):
		citations = get_cited_papers(doc_id)

		for cited, start, end in citations :
			text = utils.read_text(doc_id)

			contexts[cited].append(find_sentence(text, start, end))

# 		if len(contexts) > 100000: break

	fields = [pylucene.DocField("id", stored=True, indexed=False), 
						pylucene.DocField("contexts", stored=False, indexed=True)]
	index = pylucene.Index(index_folder, fields)


	print "Writing contexts to file for %d documents." % len(contexts)
	for i, (doc_id, ctxs) in enumerate(contexts.items()) :

		text = u"\n".join(ctxs)
		index.add(id=doc_id, contexts=text)

		# Commit and print progress every 1K entries 
		if i%1000==0 and i: 
			index.commit()
			print "%d documents indexed and written to disk." % i

		# Also write contexts into files
		with open(os.path.join(files_folder, "%s.txt"%doc_id), "w") as f :
			print >> f, text.encode("UTF-8")

	index.close()
Example #8
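 # Infinite batch generator (apparently feeding a Keras CTC-style loss): loads
 # the .npy feature matrices and character-index label sequences for the current
 # window of file indices, then advances the window, wrapping around
 # trainingFileNum.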
 def gen(batch_size=50):
     current_index = batch_size
     index = ['/01%04d' % i for i in xrange(batch_size)]
     while 1:
         X = np.asanyarray(
             [utils.read_npy(traingPath + i + '.npy') for i in index],
             dtype=NP_DTYPE)
         str2index = lambda line: [chrst[i] for i in line]
         y = np.asanyarray([
             str2index(line) for line in
             [utils.read_text(traingPath + i + '.txt') for i in index]
         ],
                           dtype=NP_DTYPE)
         if current_index + batch_size > trainingFileNum:
             index = range(current_index, trainingFileNum) + range(
                 0, batch_size + current_index - trainingFileNum)
             index = ['/01%04d' % i for i in index]
             current_index = batch_size + current_index - trainingFileNum
         else:
             index = [
                 '/01%04d' % i
                 for i in range(current_index, current_index + batch_size)
             ]
             current_index += batch_size
         yield [
             X, y,
             np.ones(batch_size) * int(width - 2),
             np.ones(batch_size) * n_len
         ], np.ones(batch_size)
Example #9
    def __init__(self, root="",split=None, 
                 transform_function=None):
        self.split = split
        
        self.n_classes = 2
        self.transform_function = transform_function
        
        ############################
        # self.path_base = "/home/tammy/LCFCN/datasets/TRANCOS_v3"
        self.path_base ="/floyd/input/logcounting"
        # self.path_base = "/mnt/datasets/public/issam/Trancos/"

        if split == "train":
            fname = self.path_base + "/image_sets/training.txt"

        elif split == "val":
            fname = self.path_base + "/image_sets/validation.txt"

        elif split == "test":
            fname = self.path_base + "/image_sets/test.txt"

        self.img_names = [name.replace(".jpg\n","") for name in ut.read_text(fname)]
        self.path = self.path_base + "/images/"
        self.path_dots = self.path_base + "/dots/"
        assert os.path.exists(self.path + self.img_names[0] + ".jpg")
Example #10
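# One-hot encodes each text file in sourceDir, runs it through `model`, then
# extends the predicted matrix to length 600 using the per-character width table
# and writes the result to targetDir as a .npy file.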
def extend_matrix(sourceDir, targetDir, encodingMethod=1):
    utils.mkdir(targetDir)

    print 'extending matrix of %s ...' % (sourceDir)
    index = sorted([fl for fl in os.listdir(sourceDir) if fl.endswith('txt')])
    extendData = [
        sourceDir + '/' + fl
        for fl in sorted(index, key=lambda name: int(name.split('.')[0]))
    ]

    widthDict = utils.width_dict(filename='material/width.txt')
    for i, fl in enumerate(extendData):
        tmpMatrix = np.zeros(shape=(1, maxlen, chrstlen), dtype=float)
        sntc = utils.read_text(fl)
        for j, char in enumerate(sntc):
            tmpMatrix[0, j, chrst[char]] = 1.0
        extendMatrix = model.predict(tmpMatrix)[0]
        # print ''.join([chrst_reversed[ch] for ch in [np.argmax(line) for line in extendMatrix]])
        mat = utils.extend_matrix(original=extendMatrix,
                                  length=600,
                                  string=sntc,
                                  widthDict=widthDict,
                                  chrst=chrst,
                                  encodingMethod=encodingMethod)
        utils.write_npy(filename='%s/%s.npy' %
                        (targetDir, index[i].split('.')[0]),
                        mat=mat)
Example #11
 def input_text(self):
     if not self.edition or not self.user:
         raise quit()
     print(
         '\ntype in the review. (exit by typing two consecutive line breaks.)\n'
     )
     self.text = utils.read_text()
     print('\nthanks.')
Example #12
 def gen_for_test(start, batch_size=50):
     index = ['/01%04d' % i for i in xrange(start, start + batch_size)]
     while 1:
         X = np.asanyarray(
             [utils.read_npy(testPath + i + '.npy') for i in index],
             dtype=NP_DTYPE)
         y = [utils.read_text(testPath + i + '.txt') for i in index]
         yield X, y
Example #13
def build(name) -> [str, None]:
    school = cfg_schools.data[name]
    ext_name = school.get('section-times')
    if ext_name is None:
        return None
    src_path = os.path.join(SRC_DIR, name + '.' + ext_name)
    text = utils.read_text(src_path)
    text = text.strip().replace('\n', SPACE)
    text = SPACE + 'sectionTimes: ' + text
    return TAG_BEFORE + text + TAG_AFTER
Example #14
def main():
    readme = utils.read_text(README_NAME)
    start = readme.index(TAG_ADAPTED_TABLE_START) + len(
        TAG_ADAPTED_TABLE_START)
    end = readme.index(TAG_ADAPTED_TABLE_END)
    with utils.open_text(README_NAME, 'w') as f:
        f.write(readme[:start])
        f.write('\n')
        f.write(create_adapted_table())
        f.write(readme[end:])
Example #15
    def make_vocab(cls, data_path, save_path):
        texts = read_text(data_path)
        words = [word for text in tqdm(texts) for word in cls.tokenizer.morphs(preprocess_text(text))]
        word_counter = Counter(words)

        vocab = {"[PAD]": 0, "[UNK]": 1}
        idx = 2
        for word, count in word_counter.most_common():
            vocab[word] = idx
            idx += 1
        save_json(save_path, vocab)
Example #16
def main():
    try:
        path = sys.argv[1]
    except IndexError:
        print("Error: No input files")
        sys.exit()

    text = read_text(path)
    # print
    p = Parser()
    info = p.parse(text)
    pprint(info)
Example #17
	def make_dataset(self, data_dir, save_dir):
		data = utils.read_text(data_dir)
		f = open(save_dir, 'wb')
		print('generating sentences')
		sents = utils.generate_sentences(data)
		print('converting sentences to code points')
		sents_cp = utils.convert_sent_to_id(sents)
		for i in tqdm(sents_cp):
			x = i[:-1]
			y = i[1:]
			pkl.dump((x, y), f)
		f.close()
		print('done')
Example #18
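# Scans each document's tokenized sentences for phrases from the nominal
# dictionary (longest names first), keeps hits that contain an NN tag and are not
# markup or URLs, and saves them as PER mentions in EDL format.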
def __extract_nom_mentions(nom_dict_file, doc_list_file, words_pos_file, dst_nom_mentions_file):
    noms = load_nom_dict(nom_dict_file)
    nom_name_list = [n for n in noms]
    nom_name_list.sort(key=lambda x: -len(x))
    nom_name_list = [n.split(' ') for n in nom_name_list]

    doc_path_dict = __load_doc_paths_as_dict(doc_list_file)

    mentions = list()
    f_wp = open(words_pos_file, 'r')
    for i, line in enumerate(f_wp):
        vals = line.rstrip().split('\t')
        docid = vals[0]

        if (i + 1) % 10 == 0:
            print i + 1, docid

        doc_path = doc_path_dict[docid]
        doc_text = read_text(doc_path).decode('utf-8')
        if doc_text.startswith(doc_head):
            doc_text = doc_text[len(doc_head):]

        num_sentences = int(vals[1])
        for j in xrange(num_sentences):
            sentence = __next_sentence_in_words_pos_file(f_wp)
            words = [tup[0].lower() for tup in sentence]
            # print words
            hit_spans, hit_indices = find_phrases_in_words(nom_name_list, words, False)
            for hit_span, hit_idx in izip(hit_spans, hit_indices):
                beg_pos = sentence[hit_span[0]][3]
                end_pos = sentence[hit_span[1] - 1][4]

                tags = [tup[2] for tup in sentence[hit_span[0]:hit_span[1]]]
                # print tags
                # if 'NN' not in tags and 'NNP' not in tags:
                #     continue
                if 'NN' not in tags:
                    continue

                name = doc_text[beg_pos:end_pos + 1].replace('\n', ' ')
                if '&lt;' in name or 'http:' in name or '&gt;' in name:
                    continue
                m = Mention(name=name, beg_pos=beg_pos, end_pos=end_pos, docid=docid, mention_type='NOM',
                            entity_type='PER', kbid='NIL00000')
                mentions.append(m)
                # print sentence[hit_span[0]], sentence[hit_span[1]]
                # print nom_name_list[hit_idx], name
        # break
    f_wp.close()

    Mention.save_as_edl_file(mentions, dst_nom_mentions_file)
Example #19
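# Builds a PMI-SO sentiment lexicon: reads the tokenized corpus, computes
# document-frequency statistics per term and per class, scores terms with
# mutual-information feature selection, and saves the ranked score list.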
def build_fs_dict(token_date_dir, result_dir, fs_percent=1):
    PMI_SO = result_dir + os.sep + 'PMI-SO'
    print 'Reading text...'
    doc_str_list_token, doc_class_list_token = utils.read_text([token_date_dir + os.sep + x for x in FNAME_LIST], SAMP_TAG)
    print 'End Reading'
    doc_terms_list_train = utils.get_doc_terms_list(doc_str_list_token)
    class_set = utils.get_class_set(doc_class_list_token)
    term_set = utils.get_term_set(doc_terms_list_train)
    
    print 'PMI-SO Sentiment Lexicon Construction...'
    df_term = utils.stat_df_term(term_set, doc_terms_list_train)
    df_class = utils.stat_df_class(class_set, doc_class_list_token)
    df_term_class = utils.stat_df_term_class(term_set, class_set, doc_terms_list_train, doc_class_list_token)
    term_set_fs, term_score_list = utils.feature_selection_mi(df_class, df_term_class)
    save_score_list(term_score_list, term_set_fs, PMI_SO)
Example #20
File: dn.py Project: meng89/nikaya
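# Builds the DN (Dīgha Nikāya) collection: reads each sutra page, starts a new
# pin whenever the header announces one, and appends a Sutra with its serials,
# titles and text lines to the current pin.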
def make_nikaya(sutra_urls):

    nikaya = _MyNikaya()
    nikaya.title_chinese = '長部'
    nikaya.title_pali = 'Digha Nikāya',
    nikaya.abbreviation = 'DN'

    for url in sutra_urls:

        chinese, pali, modified = read_text(url)

        header_lines, main_lines = split_chinese_lines(chinese)

        info = analyse_header(header_lines)

        if info.pin_title is not None:
            if not nikaya.pins or nikaya.pins[-1].title != info.pin_title:

                pin = _Pin()
                pin.serial = info.pin_serial
                pin.title = info.pin_title

                nikaya.pins.append(pin)

        sutra = Sutra()

        sutra.serial_start = info.sutra_serial_start
        sutra.serial_end = info.sutra_serial_end

        sutra.pali = pali
        sutra.chinese = chinese

        sutra.main_lines = main_lines

        sutra.modified = modified

        sutra.serial = sutra.serial_start

        sutra.title = info.sutra_title

        sutra.sec_title = sutra.serial + ' ' + sutra.title

        sutra.abbreviation = '{}.{}'.format(nikaya.abbreviation, sutra.serial)

        nikaya.pins[-1].sutras.append(sutra)

    return nikaya
Example #21
    def load_data(self, trainingNum=None, testNum=None):
        self.trainingNum = trainingNum
        self.testNum = testNum

        text = utils.read_text(filename=self.filename)
        sentences = [
            text[i:i + self.maxlen] for i in xrange(0, len(text), self.step)
        ]
        sentences = [sntc for sntc in sentences if len(sntc) == self.maxlen]
        random.shuffle(sentences)
        if trainingNum == None:
            pass
        else:
            self.traingData = sentences[:self.trainingNum]

        if testNum == None:
            pass
        else:
            self.testData = sentences[-self.testNum:]

        del sentences
Example #22
    def make_vocab(cls, data_path, save_path):
        data = read_text(data_path)
        texts = make_texts(data)

        letters = []
        for text in tqdm(texts):
            text = preprocess_text(text)
            for char in text:
                try:
                    ls = hgtk.letter.decompose(char)
                except:
                    ls = ["[NUM]", "[NUM]", "[NUM]"]
                letters.extend(ls)
        letter_counter = Counter(letters)
        vocab = {"[PAD]": 0, "[UNK]": 1}
        idx = 2
        for char, count in letter_counter.most_common():
            vocab[char] = idx
            idx += 1

        save_json(save_path, vocab)
Example #23
def get_tokens_per_citation(doc_id):
    '''
	Fetches all cited papers by paper 'doc_id', gets the contexts around these citations and
	return them in a dict structure {cited_paper_id: [token1, token2, ..., tokenN]}. If a paper
	is cited at more than one location then the tokens for each contexts are merged together.
	'''
    citations = contexts.get_cited_papers(doc_id)

    text = utils.read_text(doc_id)

    tokens_per_citation = defaultdict(list)
    ctxs = {}
    for cited, start, end in citations:

        # Only process citation if cited paper is known (cited != None)
        if cited:
            if (start, end) not in ctxs:
                ctxs[(start, end)] = tokenizer.tokenize(
                    contexts.find_sentence(text, start, end))

            tokens_per_citation[cited] += ctxs[(start, end)]

    return tokens_per_citation
Example #24
def main(input_file, vocabulary_file):
    """Automatically check and correct the spelling of a file."""
    vocabulary = utils.read_vocabulary(vocabulary_file)
    logging.info("Read %i words.", len(vocabulary))
    text = utils.read_text(input_file)
    check(text, vocabulary)
Example #25
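# Data preparation, apparently for the character-level spelling-correction model:
# builds a token vocabulary from the training books and turns it into noisy
# encoder inputs, decoder inputs and targets via transform() at the configured
# error rate.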
nb_epochs = 100
train_batch_size = 128
val_batch_size = 256
sample_mode = 'argmax'

reverse = True

data_path = './data'
train_books = ['nietzsche.txt', 'pride_and_prejudice.txt',
               'shakespeare.txt', 'war_and_peace.txt']
val_books = ['wonderland.txt']


if __name__ == '__main__':
    # Prepare training data.
    text  = read_text(data_path, train_books)
    vocab = tokenize(text)
    vocab = list(filter(None, set(vocab)))
    
    # `maxlen` is the length of the longest word in the vocabulary
    # plus two SOS and EOS characters.
    maxlen = max([len(token) for token in vocab]) + 2
    train_encoder, train_decoder, train_target = transform(
        vocab, maxlen, error_rate=error_rate, shuffle=False)
    print(train_encoder[:10])
    print(train_decoder[:10])
    print(train_target[:10])

    input_chars = set(' '.join(train_encoder))
    target_chars = set(' '.join(train_decoder))
    nb_input_chars = len(input_chars)
Example #26
 def input_text(self): 
     if not self.edition or not self.user:
         raise quit()
     print '\ntype in the review. (exit by typing two consecutive line breaks.)\n'
     self.text = utils.read_text()        
     print '\nthanks.'
Example #27
    parser.add_argument("--train_w2v",
                        help="train word2vec model from input file",
                        action="store_true")
    parser.add_argument("--train_cnn",
                        help="train CNN model from input labels and data",
                        action="store_true")
    parser.add_argument("--predict_cnn",
                        help="use CNN model to predict labels for test data",
                        action="store_true")
    parser.add_argument("--validate_cnn", action="store_true")

    args = parser.parse_args()
    if args.train_d2v:
        text = []
        for i in args.di:
            text += utils.read_text(i)
        utils.logger.info("text reading finished")
        tokens = utils.tokenize_paragraph_d2v(text)
        utils.logger.info("text tokenizing finished")
        utils.compute_paragraph_doc2vec(tokens,
                                        vector_size=args.vector_size,
                                        epochs=25,
                                        workers=multiprocessing.cpu_count(),
                                        model_path=args.dm)
        utils.logger.info("doc2vec training finished")
    elif args.train_w2v:
        text = []
        for i in args.wi:
            text += utils.read_text(i)
        utils.logger.info("text reading finished")
        tokens = utils.tokenize_paragraph_w2v(text)
Example #28
    print('\nConverting {} ...'.format(TMP_TEST_PROTOTXT_FILE))
    convert_prototxt()

    print('\nMaking {} ...'.format(DST_DEPLOY_PROTOTXT_FILE))
    make_deploy_prototxt()

    print('\nPostprocessing {} ...'.format(DST_TEST_PROTOTXT_FILE))
    postprocess_test_prototxt()

    info = {}
    info['weights'] = WEIGHTS_FILE
    info['lmdb_dir'] = LMDB_TARGET_DIR
    info['lmdb_image_count'] = LMDB_IMAGE_COUNT
    info['test_prototxt_f32'] = SRC_TEST_PROTOTXT_FILE
    info['test_prototxt_i8'] = DST_TEST_PROTOTXT_FILE
    info['deploy_prototxt_f32'] = SRC_DEPLOY_PROTOTXT_FILE
    info['deploy_prototxt_i8'] = DST_DEPLOY_PROTOTXT_FILE
    info['label_map_file'] = LABEL_MAP_FILE
    info['name_size_file'] = NAME_SIZE_FILE
    utils.write_json(PREPARED_INFO_FILE, info)

  finally:
    print('\nFinalizing...')
    if caffe_package_init_file_uid:
      if os.path.isfile(caffe_package_init_file):
        existed_uid = utils.read_text(caffe_package_init_file)
        print('{} exists'.format(caffe_package_init_file))
        if existed_uid == caffe_package_init_file_uid:
          os.remove(caffe_package_init_file)
          print('Removed')
Example #29
    def _read_standoff(self, corpus_dir, encoding="UTF-8"):
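        # Parses brat-style standoff annotations: for every *.ann/*.txt pair it
        # splits the text into sentences and tokens, maps character offsets to
        # (sentence, token) positions, collects T* mention lines and N*
        # normalization references, and merges duplicate mention spans.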
        docs = {}

        for filename in glob(os.path.join(corpus_dir, "*.ann")):
            doc = read_text(filename.replace(".ann", ".txt"),
                            encoding=encoding)

            cursor = 0
            start_offsets = {}
            end_offsets = {}

            sentences = []

            for sentence_index, sentence in enumerate(doc.split("\n")):
                tokens = sentence.split(" ")

                for token_index, token in enumerate(tokens):
                    start_offsets[cursor] = (sentence_index, token_index)
                    end_offsets[cursor + len(token)] = (sentence_index,
                                                        token_index)
                    cursor += len(token) + 1

                sentences.append({"tokens": tokens, "mentions": []})

            assert len(doc) == cursor - 1

            mentions = {}
            references = {}

            for line in read_lines(filename, encoding=encoding):
                if line.startswith("T"):
                    matcher = MENTION_PATTERN.match(line)

                    mention_id, mention_label, mention_start_offset, mention_end_offset, mention_string = (
                        matcher.groups())

                    assert mention_id not in mentions

                    if mention_label in self.get_labels():
                        mentions[mention_id] = {
                            "id": mention_id,
                            "label": mention_label,
                            "start": int(mention_start_offset),
                            "end": int(mention_end_offset),
                            "string": mention_string,
                            "references": {},
                        }
                elif line.startswith("N"):
                    matcher = REFERENCE_PATTERN.match(line)

                    reference_id, mention_id, resource_name, record_id, reference_string = (
                        matcher.groups())

                    assert reference_id not in references

                    references[reference_id] = {
                        "id": reference_id,
                        "mention": mention_id,
                        "resource": resource_name,
                        "record": record_id,
                        "string": reference_string,
                    }

            for reference in references.values():
                if reference["mention"].startswith("T"):
                    resource_record_pair = (reference["resource"],
                                            reference["record"])

                    assert (
                        resource_record_pair
                        not in mentions[reference["mention"]]["references"])

                    mentions[reference["mention"]]["references"][
                        resource_record_pair] = reference["string"]

            seen_mentions = defaultdict(dict)

            for mention in mentions.values():
                left_sentence_index, mention_start_offset = start_offsets[
                    mention["start"]]
                right_sentence_index, mention_end_offset = end_offsets[
                    mention["end"]]

                assert (left_sentence_index == right_sentence_index
                        and mention_start_offset <= mention_end_offset
                        and " ".join(sentences[left_sentence_index]["tokens"]
                                     [mention_start_offset:mention_end_offset +
                                      1]) == mention["string"])

                if (
                        mention_start_offset,
                        mention_end_offset,
                        mention["label"],
                ) in seen_mentions[left_sentence_index]:
                    seen_mention = seen_mentions[left_sentence_index][
                        mention_start_offset, mention_end_offset,
                        mention["label"]]

                    assert not (seen_mention["references"]
                                and mention["references"]
                                and seen_mention["references"] !=
                                mention["references"])

                    seen_mention["references"].update(mention["references"])
                else:
                    sentences[left_sentence_index]["mentions"].append({
                        "id":
                        mention["id"],
                        "label":
                        mention["label"],
                        "start":
                        mention_start_offset,
                        "end":
                        mention_end_offset,
                        "references":
                        mention["references"],
                    })

                    seen_mentions[left_sentence_index][
                        mention_start_offset, mention_end_offset,
                        mention["label"]] = mention

            docs[os.path.basename(filename)] = {"sentences": sentences}

        return docs
Example #30
File: an.py Project: meng89/nikaya
def make_nikaya(sutra_urls):

    nikaya = MyNikaya()
    nikaya.title_chinese = "增支部"
    nikaya.title_pali = ("Aṅguttara nikāya",)
    nikaya.abbreviation = "AN"

    for url in sutra_urls:

        chinese, pali, modified = read_text(url)

        header_lines, main_lines = split_chinese_lines(chinese)

        info = analyse_header(header_lines)

        if info.ji_serial is not None:
            if not nikaya.jis or nikaya.jis[-1].serial != info.ji_serial:
                ji = Ji()
                ji.serial = info.ji_serial

                nikaya.jis.append(ji)

        if info.pin_serial is not None:
            if not nikaya.jis[-1].pins or nikaya.jis[-1].pins[-1].serial != info.pin_serial:
                pin = Pin()
                pin.serial = info.pin_serial
                pin.title = info.pin_title

                nikaya.jis[-1].pins.append(pin)

        sutra = Sutra()

        sutra.serial_start = info.sutra_serial_start
        sutra.serial_end = info.sutra_serial_end

        sutra.pali = pali
        sutra.chinese = chinese

        sutra.main_lines = main_lines

        sutra.modified = modified

        if sutra.serial_start == sutra.serial_end:
            sutra.serial = sutra.serial_start
        else:
            sutra.serial = "{}-{}".format(sutra.serial_start, sutra.serial_end)

        if info.sutra_title:
            sutra.title = info.sutra_title
        else:
            sutra.title = ""

        if sutra.title:
            sutra.sec_title = sutra.serial + " " + sutra.title
        else:
            sutra.sec_title = sutra.serial

        sutra.abbreviation = "{}.{}.{}".format(nikaya.abbreviation, nikaya.jis[-1].serial, sutra.serial)

        nikaya.jis[-1].pins[-1].sutras.append(sutra)

    return nikaya
Example #31
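# Reads one review file per instance from the input directory tree, builds a
# feature table from the texts via read_text, reduces it to the requested number
# of dimensions, and prepares a shuffled train/test split for the author labels.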
    print("Reading {}...".format(args.inputdir))

    d = {}
    d1 = []
    X = []
    Y = []
    for infile in glob.glob(args.inputdir + '/*/*/*'):
        dic = {}
        instance = os.path.split(os.path.dirname(infile))[-1]
        review_file = open(infile,'r').read()
        X.append(review_file)
        Y.append(instance)
        if instance not in d:
            d[instance] = []
        d[instance].append(review_file)
    X, _ = read_text(X)
    df = pd.DataFrame(X)
    df = df.fillna(0)
    original_author_names = Y.copy()

    Y =  label_encoder(Y)
    # Do what you need to read the documents here.

    print("Constructing table with {} feature dimensions and {}% test instances...".format(args.dims, args.testsize))
    # Build the table here.
    X = reduce_dim(df, args.dims)

    train_X, test_X, train_Y, test_Y, tag = shuffle_split(X, Y, test_split = args.testsize)
    train_X = pd.DataFrame(train_X)
    test_X = pd.DataFrame(test_X)
    train_Y = pd.DataFrame(train_Y)
Example #32
    model.add(LSTM(256, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=[
            'accuracy',
        ],
    )
    return model


if __name__ == '__main__':
    text = read_text([FILE_NAME, FILE_NAME2])
    text = preprocess(text, force=True)

    max_words = 100000000
    WINDOW = 4

    tokenizer = Tokenizer(num_words=max_words,
                          filters='"#$%&()*+-/:;<=>@[\]^_`{|}~')
    tokenizer.fit_on_texts(text)

    X_train = tokenizer.texts_to_sequences(text)
    print('Train shape:', np.array(X_train).shape)
    X_train_time, Y_train_time = create_dataset(np.array(X_train), WINDOW)

    vocab_size = len(tokenizer.word_index) + 1
Example #33
import os
import unidecode


def read_text(data_path, list_of_books):
    text = ''
    for book in list_of_books:
        file_path = os.path.join(data_path, book)
        strings = unidecode.unidecode(open(file_path).read())
        text += strings + ' '
    return text


test_path = 'E:\\level4\\second\\NLP\\project\\deep-spell-checkr-master\\data'
test_book = ['input.txt']
test_sentence = read_text(test_path, test_book)
true_path = 'E:\\level4\\second\\NLP\\project\\deep-spell-checkr-master\\data'
true_book = ['true.txt']
true_sentences = read_text(true_path, true_book)
string = ''
arr = []
for x in true_sentences:
    if (x == ' '):
        arr.append(string)
        string = ''
    else:
        string = string + x
true = arr
Example #34
error_rate = 0.6
reverse = True
model_path = './models/seq2seq.h5'
hidden_size = 512
sample_mode = 'argmax'
data_path = './data'
books = [
    'nietzsche.txt', 'pride_and_prejudice.txt', 'shakespeare.txt',
    'war_and_peace.txt'
]

test_sentence = 'The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.'

if __name__ == '__main__':
    text = read_text(data_path, books)
    vocab = tokenize(text)
    vocab = list(filter(None, set(vocab)))
    # `maxlen` is the length of the longest word in the vocabulary
    # plus two SOS and EOS characters.
    maxlen = max([len(token) for token in vocab]) + 2
    train_encoder, train_decoder, train_target = transform(
        vocab, maxlen, error_rate=error_rate, shuffle=False)

    tokens = tokenize(test_sentence)
    tokens = list(filter(None, tokens))
    nb_tokens = len(tokens)
    misspelled_tokens, _, target_tokens = transform(tokens,
                                                    maxlen,
                                                    error_rate=error_rate,
                                                    shuffle=False)
Example #35
# -*- coding: utf-8 -*-
# !/usr/bin/env python
'''
@author: Yang
@time: 17-11-14 9:18 PM
'''

import os
import utils
import re

import matplotlib.pyplot as plt
import numpy as np

plt.figure()

# , 'underfitting'
Types = ['acceleration']
for dataType in Types:
    dirs = 'model/%s/image/' % (dataType)
    for subdir in os.listdir(dirs):
        logFile = dirs + subdir + '/log.txt'
        logLines = utils.read_text(filename=logFile).strip().split('\n')
        logLines = [re.findall(r'\d+\.?\d*', line) for line in logLines]
        logLines = np.asarray(logLines, dtype=np.float)
        plt.plot(logLines[:, -1], label=logFile)

plt.legend(loc='best')
plt.grid()
plt.show()
Example #36
import utils as ut

if __name__ == '__main__':
    lines = ut.read_text("inputs/day1_1.txt")
    digits = lines[0]
    n_digits = len(digits)

    d_sum = 0
    for i in range(n_digits):
        d = digits[i]

        if i == (n_digits - 1):
            d_next = digits[0]
        else:
            d_next = digits[i + 1]

        if int(d) == int(d_next):
            d_sum += int(d)

    print(d_sum)
Example #37
File: sn.py Project: meng89/nikaya
def make_nikaya(sutra_urls):

    nikaya = MyNikaya()
    nikaya.title_chinese = '相應部'
    nikaya.title_pali = 'Saṃyutta Nikāya',
    nikaya.abbreviation = 'SN'

    for url in sutra_urls:

        chinese, pali, modified = read_text(url)

        header_lines, main_lines = split_chinese_lines(chinese)

        info = analyse_header(header_lines)

        if info.pian_serial is not None:
            if not nikaya.subs or nikaya.subs[-1].serial != info.pian_serial:
                pian = Pian()
                pian.serial = info.pian_serial
                pian.title = info.pian_title

                nikaya.subs.append(pian)

        if info.xiangying_serial is not None:
            if not nikaya.subs[-1].subs or nikaya.subs[-1].subs[-1].serial != info.xiangying_serial:
                xiangying = XiangYing()
                xiangying.serial = info.xiangying_serial
                xiangying.title = info.xiangying_title

                xiangying.sec_title = '{} {}'.format(xiangying.serial, xiangying.title)

                nikaya.subs[-1].subs.append(xiangying)

        if info.pin_serial is not None:
            if not nikaya.subs[-1].subs[-1].subs or nikaya.subs[-1].subs[-1].subs[-1].serial != info.pin_serial:
                pin = Pin()
                pin.serial = info.pin_serial
                pin.title = info.pin_title

                nikaya.subs[-1].subs[-1].subs.append(pin)

        if not nikaya.pians[-1].xiangyings[-1].pins:
            pin = Pin()
            pin.serial = 1
            pin.title = '(未分品)'
            nikaya.pians[-1].xiangyings[-1].pins.append(pin)

        sutra = Sutra()

        sutra.serial_start = info.sutra_serial_start
        sutra.serial_end = info.sutra_serial_end

        sutra.pali = pali
        sutra.chinese = chinese

        sutra.main_lines = main_lines

        sutra.modified = modified

        if sutra.serial_start == sutra.serial_end:
            sutra.serial = sutra.serial_start
        else:
            sutra.serial = '{}-{}'.format(sutra.serial_start, sutra.serial_end)

        if info.sutra_title:
            sutra.title = info.sutra_title
        else:
            sutra.title = ''

        if sutra.title:
            sutra.sec_title = sutra.serial + ' ' + sutra.title
        else:
            sutra.sec_title = sutra.serial

        sutra.abbreviation = '{}.{}.{}'.format(nikaya.abbreviation,
                                               nikaya.pians[-1].xiangyings[-1].serial,
                                               sutra.serial)

        nikaya.pians[-1].xiangyings[-1].pins[-1].sutras.append(sutra)

    return nikaya