Example no. 1
def generate_summary(article_sent_tokens, qid_ssi_to_importances, example_idx):
    qid = example_idx

    summary_sent_tokens = []
    summary_tokens = util.flatten_list_of_lists(summary_sent_tokens)
    already_used_source_indices = []
    similar_source_indices_list = []
    summary_sents_for_html = []
    ssi_length_extractive = None

    # Iteratively select a singleton/pair from the article that has the highest score from BERT
    while len(summary_tokens) < 300:
        if len(summary_tokens) >= l_param and ssi_length_extractive is None:
            ssi_length_extractive = len(similar_source_indices_list)
        mmr_dict = util.calc_MMR_source_indices(article_sent_tokens, summary_tokens, None, qid_ssi_to_importances, qid=qid)
        sents, source_indices = get_best_source_sents(article_sent_tokens, mmr_dict, already_used_source_indices)
        if len(source_indices) == 0:
            break
        summary_sent_tokens.extend(sents)
        summary_tokens = util.flatten_list_of_lists(summary_sent_tokens)
        similar_source_indices_list.append(source_indices)
        summary_sents_for_html.append(' <br> '.join([' '.join(sent) for sent in sents]))
        if filter_sentences:
            already_used_source_indices.extend(source_indices)
    if ssi_length_extractive is None:
        ssi_length_extractive = len(similar_source_indices_list)
    selected_article_sent_indices = util.flatten_list_of_lists(similar_source_indices_list[:ssi_length_extractive])
    summary_sents = [' '.join(sent) for sent in util.reorder(article_sent_tokens, selected_article_sent_indices)]
    return summary_sents, similar_source_indices_list, summary_sents_for_html, ssi_length_extractive
def generate_summary(article_sent_tokens, qid_ssi_to_importances, example_idx):
    qid = example_idx

    summary_sent_tokens = []
    summary_tokens = util.flatten_list_of_lists(summary_sent_tokens)
    already_used_source_indices = []
    similar_source_indices_list = []
    summary_sents_for_html = []
    ssi_length_extractive = None
    while len(summary_tokens) < 1000:
        if len(summary_tokens) >= l_param and ssi_length_extractive is None:
            ssi_length_extractive = len(similar_source_indices_list)
        if FLAGS.dataset_name == 'xsum' and len(summary_tokens) > 0:
            ssi_length_extractive = len(similar_source_indices_list)
            break
        mmr_dict = util.calc_MMR_source_indices(article_sent_tokens,
                                                summary_tokens,
                                                None,
                                                qid_ssi_to_importances,
                                                qid=qid)
        sents, source_indices = get_best_source_sents(
            article_sent_tokens, mmr_dict, already_used_source_indices)
        if len(source_indices) == 0:
            break
        summary_sent_tokens.extend(sents)
        summary_tokens = util.flatten_list_of_lists(summary_sent_tokens)
        similar_source_indices_list.append(source_indices)
        summary_sents_for_html.append(' <br> '.join(
            [' '.join(sent) for sent in sents]))
        if filter_sentences:
            already_used_source_indices.extend(source_indices)
    if ssi_length_extractive is None:
        ssi_length_extractive = len(similar_source_indices_list)
    selected_article_sent_indices = util.flatten_list_of_lists(
        similar_source_indices_list[:ssi_length_extractive])
    summary_sents = [
        ' '.join(sent) for sent in util.reorder(article_sent_tokens,
                                                selected_article_sent_indices)
    ]
    # summary = '\n'.join([' '.join(tokens) for tokens in summary_sent_tokens])
    return summary_sents, similar_source_indices_list, summary_sents_for_html, ssi_length_extractive
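Both copies of generate_summary above finish by calling util.reorder(article_sent_tokens, selected_article_sent_indices). The util module itself is not part of this excerpt; judging only from how it is called here, reorder(lst, indices) appears to return the elements of lst at the given indices, in that order. A minimal sketch under that assumption (a hypothetical stand-in, not the project's actual helper):

def reorder(lst, indices):
    # Hypothetical: pick out the items of lst selected by indices, preserving index order.
    return [lst[i] for i in indices]

selected = reorder([['a', 'cat'], ['sat', 'down'], ['the', 'end']], [2, 0])
# -> [['the', 'end'], ['a', 'cat']]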
Example no. 3
def main():
    config.initializeConfig()
    if len(sys.argv) < 3:
        print 'USAGE:', sys.argv[0], USAGE
        # sys.exit(0)
        myVrp, solutions, myStyleSheet = None, None, None
    else:
        # type of input data we're working on
        toks = sys.argv[1].split(':')
        type = toks[0]
        subtype = toks[1] if len(toks) > 1 else 'default'
        # solutions to load and their subtype
        if len(sys.argv) > 3 and sys.argv[3][0] == ':':
            solutionSubtype = sys.argv[3][1:]
            solutionFileNames = sys.argv[4:]
        elif len(sys.argv) > 2:
            solutionSubtype = 'default'
            solutionFileNames = sys.argv[3:]
        else:
            solutionFileNames = False
        # loader object to load all of this
        loader = loaddata.DataLoader()
        # here we load the data
        myVrp = loader.loadInstance(sys.argv[2], type, subtype)
        # reorder solutions using numbers in file names
        if solutionFileNames:
            solutionFileNames = util.reorder(solutionFileNames)
            solutions = [
                loader.loadSolution(fName, myVrp, type, solutionSubtype)
                for fName in solutionFileNames
            ]
        else:
            solutions = None
        myStyleSheet = loader.loadStyleSheet(type)


#         myStyleSheet = loaddata.stylesheetFromType(type)
#     myStyleSheet = stylesheet.FunkyStyleSheet()
    app = wxgui.vrpgui.VrpGui(myVrp, solutions, myStyleSheet)
    app.MainLoop()
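For orientation, a hypothetical invocation that matches the argument handling above (the script and file names are made up, since they are not shown in this snippet):

#   python main.py vrp:default instance01.vrp :best sol_2.txt sol_1.txt
# sys.argv[1] -> "type:subtype", sys.argv[2] -> instance file,
# optional ":solutionSubtype" followed by the solution files to load (which are then passed through util.reorder)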
def get_similar_source_sents_recursive(summ_sent, partial_summ_sent, selection, article_sent_tokens, vocab, similarities, depth, sentence_limit, min_matched_tokens):
    if sentence_limit == 1:
        if depth > 2:
            return [[]], [[]], [[]]
    elif len(selection) < 3 or depth >= sentence_limit:      # base case: remaining selection is too short, or the depth limit is reached
        return [[]], [[]], [[]]

    all_sent_indices = []
    all_lcs_paths = []
    all_smooth_article_paths = []

    # partial_summ_sent = util.reorder(summ_sent, selection)
    top_sent_indices, top_similarity = get_top_similar_sent(partial_summ_sent, article_sent_tokens, vocab)
    top_similarities = util.reorder(similarities, top_sent_indices)
    top_sent_indices = [x for _, x in sorted(zip(top_similarities, top_sent_indices), key=lambda pair: pair[0])][::-1]
    for top_sent_idx in top_sent_indices:
        nonstopword_matches, _ = util.matching_unigrams(partial_summ_sent, article_sent_tokens[top_sent_idx], should_remove_stop_words=True)
        lcs_len, (summ_lcs_path, _) = util.matching_unigrams(partial_summ_sent, article_sent_tokens[top_sent_idx])
        smooth_article_path = get_smooth_path(summ_sent, article_sent_tokens[top_sent_idx])
        if len(nonstopword_matches) < min_matched_tokens:
            continue
        leftover_selection = [idx for idx in range(len(partial_summ_sent)) if idx not in summ_lcs_path]
        partial_summ_sent = replace_with_blanks(partial_summ_sent, leftover_selection)

        sent_indices, lcs_paths, smooth_article_paths = get_similar_source_sents_recursive(
            summ_sent, partial_summ_sent, leftover_selection, article_sent_tokens, vocab, similarities, depth+1,
            sentence_limit, min_matched_tokens)   # recursive call

        combined_sent_indices = [[top_sent_idx] + indices for indices in sent_indices]      # append my result to the recursive collection
        combined_lcs_paths = [[summ_lcs_path] + paths for paths in lcs_paths]
        combined_smooth_article_paths = [[smooth_article_path] + paths for paths in smooth_article_paths]

        all_sent_indices.extend(combined_sent_indices)
        all_lcs_paths.extend(combined_lcs_paths)
        all_smooth_article_paths.extend(combined_smooth_article_paths)
    if len(all_sent_indices) == 0:
        return [[]], [[]], [[]]
    return all_sent_indices, all_lcs_paths, all_smooth_article_paths
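replace_with_blanks is not included in this excerpt. Given how it is used above (each recursion level should only try to cover the still-unmatched summary positions in leftover_selection), one plausible reading, offered purely as an assumption, is:

def replace_with_blanks(tokens, keep_indices):
    # Hypothetical: blank every position not in keep_indices so already-covered tokens are not matched again.
    keep = set(keep_indices)
    return [tok if i in keep else '' for i, tok in enumerate(tokens)]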
Example no. 5
def evaluate_example(ex):
    example, example_idx, qid_ssi_to_importances, _, _ = ex
    print(example_idx)

    # Read example from dataset
    raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(example, names_to_types)
    article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
    enforced_groundtruth_ssi_list = util.enforce_sentence_limit(groundtruth_similar_source_indices_list, sentence_limit)
    groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]]
    groundtruth_summ_sent_tokens = [sent.split(' ') for sent in groundtruth_summ_sents[0]]

    if FLAGS.upper_bound:
        # If upper bound, then get the groundtruth singletons/pairs
        replaced_ssi_list = util.replace_empty_ssis(enforced_groundtruth_ssi_list, raw_article_sents)
        selected_article_sent_indices = util.flatten_list_of_lists(replaced_ssi_list)
        summary_sents = [' '.join(sent) for sent in util.reorder(article_sent_tokens, selected_article_sent_indices)]
        similar_source_indices_list = groundtruth_similar_source_indices_list
        ssi_length_extractive = len(similar_source_indices_list)
    else:
        # Generates summary based on BERT output. This is an extractive summary.
        summary_sents, similar_source_indices_list, summary_sents_for_html, ssi_length_extractive = generate_summary(article_sent_tokens, qid_ssi_to_importances, example_idx)
        similar_source_indices_list_trunc = similar_source_indices_list[:ssi_length_extractive]
        summary_sents_for_html_trunc = summary_sents_for_html[:ssi_length_extractive]
        if example_idx <= 1:
            summary_sent_tokens = [sent.split(' ') for sent in summary_sents_for_html_trunc]
            extracted_sents_in_article_html = html_highlight_sents_in_article(summary_sent_tokens, similar_source_indices_list_trunc,
                                            article_sent_tokens, doc_indices=doc_indices)

            groundtruth_ssi_list, lcs_paths_list, article_lcs_paths_list = get_simple_source_indices_list(
                                            groundtruth_summ_sent_tokens,
                                           article_sent_tokens, None, sentence_limit, min_matched_tokens)
            groundtruth_highlighted_html = html_highlight_sents_in_article(groundtruth_summ_sent_tokens, groundtruth_ssi_list,
                                            article_sent_tokens, lcs_paths_list=lcs_paths_list, article_lcs_paths_list=article_lcs_paths_list, doc_indices=doc_indices)

            all_html = '<u>System Summary</u><br><br>' + extracted_sents_in_article_html + '<u>Groundtruth Summary</u><br><br>' + groundtruth_highlighted_html
            ssi_functions.write_highlighted_html(all_html, html_dir, example_idx)
    rouge_functions.write_for_rouge(groundtruth_summ_sents, summary_sents, example_idx, ref_dir, dec_dir)
    return (groundtruth_similar_source_indices_list, similar_source_indices_list, ssi_length_extractive)
def get_merge_example(similar_source_indices, article_sent_tokens, summ_sent,
                      corefs, article_lcs_paths):
    # restricted_source_indices = []
    # for source_indices_idx, source_indices in enumerate(similar_source_indices):
    #     if source_indices_idx >= FLAGS.sentence_limit:
    #         break
    #     restricted_source_indices.append(source_indices[0])
    if FLAGS.chronological and len(similar_source_indices) > 1:
        if similar_source_indices[0] > similar_source_indices[1]:
            similar_source_indices = (min(similar_source_indices),
                                      max(similar_source_indices))
            article_lcs_paths = (article_lcs_paths[1], article_lcs_paths[0])
    merged_example_sentences = [
        ' '.join(sent)
        for sent in util.reorder(article_sent_tokens, similar_source_indices)
    ]
    merged_example_article_text = ' '.join(merged_example_sentences)
    merged_example_abstracts = [[' '.join(summ_sent)]]
    merge_example = convert_data.make_example(merged_example_article_text,
                                              merged_example_abstracts, None,
                                              merged_example_sentences, corefs,
                                              article_lcs_paths)
    return merge_example
Example no. 7
def main():
    config.initializeConfig()
    if len(sys.argv) < 3:
        print 'USAGE:', sys.argv[0], USAGE
        # sys.exit(0)
        myVrp, solutions, myStyleSheet = None, None, None
    else:
        # type of input data we're working on
        toks = sys.argv[1].split(':')
        type = toks[0]
        subtype = toks[1] if len(toks) > 1 else 'default'
        # solutions to load and their subtype
        if len(sys.argv) > 3 and sys.argv[3][0] == ':':
            solutionSubtype = sys.argv[3][1:]
            solutionFileNames = sys.argv[4:]
        elif len(sys.argv) > 2:
            solutionSubtype = 'default'
            solutionFileNames = sys.argv[3:]
        else:
            solutionFileNames = False
        # loader object to load all of this
        loader = loaddata.DataLoader()
        # here we load the data
        myVrp = loader.loadInstance(sys.argv[2], type, subtype)
        # reorder solutions using numbers in file names
        if solutionFileNames:
            solutionFileNames = util.reorder(solutionFileNames)
            solutions = [ loader.loadSolution(fName, myVrp,
                                              type, solutionSubtype)
                          for fName in solutionFileNames ]
        else:
            solutions = None
        myStyleSheet = loader.loadStyleSheet(type)
#         myStyleSheet = loaddata.stylesheetFromType(type)
#     myStyleSheet = stylesheet.FunkyStyleSheet()
    app = wxgui.vrpgui.VrpGui(myVrp, solutions, myStyleSheet)
    app.MainLoop()
Example no. 8
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.dataset_name)

    if len(unused_argv) != 1:  # raise an error if flags were entered incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    out_dir = os.path.join(
        os.path.expanduser('~') + '/data/kaiqiang_data', FLAGS.dataset_name)
    if FLAGS.mode == 'write':
        util.create_dirs(out_dir)
        if FLAGS.dataset_name == 'duc_2004':
            dataset_splits = ['test']
        elif FLAGS.dataset_split == 'all':
            dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:

            if dataset_split == 'test':
                ssi_data_path = os.path.join(
                    'logs/%s_bert_both_sentemb_artemb_plushidden' %
                    FLAGS.dataset_name, 'ssi.pkl')
                print(util.bcolors.OKGREEN +
                      "Loading SSI from BERT at %s" % ssi_data_path +
                      util.bcolors.ENDC)
                with open(ssi_data_path) as f:
                    ssi_triple_list = pickle.load(f)

            source_dir = os.path.join(data_dir, FLAGS.dataset_name)
            source_files = sorted(
                glob.glob(source_dir + '/' + dataset_split + '*'))

            total = len(source_files) * 1000 if (
                'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
                or 'xsum' in FLAGS.dataset_name) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            out_document_path = os.path.join(out_dir,
                                             dataset_split + '.Ndocument')
            out_summary_path = os.path.join(out_dir,
                                            dataset_split + '.Nsummary')
            out_example_idx_path = os.path.join(out_dir,
                                                dataset_split + '.Nexampleidx')

            doc_writer = open(out_document_path, 'w')
            if dataset_split != 'test':
                sum_writer = open(out_summary_path, 'w')
            ex_idx_writer = open(out_example_idx_path, 'w')

            for example_idx, example in enumerate(
                    tqdm(example_generator, total=total)):
                if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                    break
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                article_sent_tokens = [
                    util.process_sent(sent) for sent in raw_article_sents
                ]
                if FLAGS.dataset_name == 'duc_2004':
                    groundtruth_summ_sents = [[
                        sent.strip()
                        for sent in gt_summ_text.strip().split('\n')
                    ] for gt_summ_text in groundtruth_summary_text]
                else:
                    groundtruth_summ_sents = [[
                        sent.strip() for sent in
                        groundtruth_summary_text.strip().split('\n')
                    ]]
                if doc_indices is None:
                    doc_indices = [0] * len(
                        util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                # rel_sent_indices, _, _ = preprocess_for_lambdamart_no_flags.get_rel_sent_indices(doc_indices, article_sent_tokens)

                if dataset_split == 'test':
                    if example_idx >= len(ssi_triple_list):
                        raise Exception(
                            'Len of ssi list (%d) is less than number of examples (>=%d)'
                            % (len(ssi_triple_list), example_idx))
                    ssi_length_extractive = ssi_triple_list[example_idx][2]
                    if ssi_length_extractive > 1:
                        a = 0  # no-op; likely left in as a debugger breakpoint anchor
                    ssi = ssi_triple_list[example_idx][1]
                    ssi = ssi[:ssi_length_extractive]
                    groundtruth_similar_source_indices_list = ssi
                else:
                    groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                        groundtruth_similar_source_indices_list,
                        FLAGS.sentence_limit)

                for ssi_idx, ssi in enumerate(
                        groundtruth_similar_source_indices_list):
                    if len(ssi) == 0:
                        continue
                    my_article = ' '.join(util.reorder(raw_article_sents, ssi))
                    doc_writer.write(my_article + '\n')
                    if dataset_split != 'test':
                        sum_writer.write(groundtruth_summ_sents[0][ssi_idx] +
                                         '\n')
                    ex_idx_writer.write(str(example_idx) + '\n')
    elif FLAGS.mode == 'evaluate':
        summary_dir = '/home/logan/data/kaiqiang_data/logan_ACL/trained_on_' + FLAGS.train_dataset + '/' + FLAGS.dataset_name
        out_summary_path = os.path.join(summary_dir, 'test' + 'Summary.txt')
        out_example_idx_path = os.path.join(out_dir, 'test' + '.Nexampleidx')
        decode_dir = 'logs/kaiqiang_%s_trainedon%s' % (FLAGS.dataset_name,
                                                       FLAGS.train_dataset)
        rouge_ref_dir = os.path.join(decode_dir, 'reference')
        rouge_dec_dir = os.path.join(decode_dir, 'decoded')
        util.create_dirs(rouge_ref_dir)
        util.create_dirs(rouge_dec_dir)

        def num_lines_in_file(file_path):
            with open(file_path) as f:
                num_lines = sum(1 for line in f)
            return num_lines

        def process_example(sents, ex_idx, groundtruth_summ_sents):
            final_decoded_words = []
            for sent in sents:
                final_decoded_words.extend(sent.split(' '))
            rouge_functions.write_for_rouge(groundtruth_summ_sents,
                                            None,
                                            ex_idx,
                                            rouge_ref_dir,
                                            rouge_dec_dir,
                                            decoded_words=final_decoded_words,
                                            log=False)

        num_lines_summary = num_lines_in_file(out_summary_path)
        num_lines_example_indices = num_lines_in_file(out_example_idx_path)
        if num_lines_summary != num_lines_example_indices:
            raise Exception(
                'Num lines summary != num lines example indices: (%d, %d)' %
                (num_lines_summary, num_lines_example_indices))

        source_dir = os.path.join(data_dir, FLAGS.dataset_name)
        example_generator = data.example_generator(source_dir + '/' + 'test' +
                                                   '*',
                                                   True,
                                                   False,
                                                   should_check_valid=False)

        # Despite the "writer" names, these files are opened here for reading
        sum_writer = open(out_summary_path)
        ex_idx_writer = open(out_example_idx_path)
        prev_ex_idx = 0
        sents = []

        for line_idx in tqdm(range(num_lines_summary)):
            line = sum_writer.readline()
            ex_idx = int(ex_idx_writer.readline())

            if ex_idx == prev_ex_idx:
                sents.append(line)
            else:
                example = example_generator.next()
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                if FLAGS.dataset_name == 'duc_2004':
                    groundtruth_summ_sents = [[
                        sent.strip()
                        for sent in gt_summ_text.strip().split('\n')
                    ] for gt_summ_text in groundtruth_summary_text]
                else:
                    groundtruth_summ_sents = [[
                        sent.strip() for sent in
                        groundtruth_summary_text.strip().split('\n')
                    ]]
                process_example(sents, ex_idx, groundtruth_summ_sents)
                prev_ex_idx = ex_idx
                sents = [line]

        example = example_generator.next()
        raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
            example, names_to_types)
        if FLAGS.dataset_name == 'duc_2004':
            groundtruth_summ_sents = [[
                sent.strip() for sent in gt_summ_text.strip().split('\n')
            ] for gt_summ_text in groundtruth_summary_text]
        else:
            groundtruth_summ_sents = [[
                sent.strip()
                for sent in groundtruth_summary_text.strip().split('\n')
            ]]
        process_example(sents, ex_idx, groundtruth_summ_sents)

        print("Now starting ROUGE eval...")
        if FLAGS.dataset_name == 'xsum':
            l_param = 100
        else:
            l_param = 100  # note: currently the same cap in both branches
        results_dict = rouge_functions.rouge_eval(rouge_ref_dir,
                                                  rouge_dec_dir,
                                                  l_param=l_param)
        rouge_functions.rouge_log(results_dict, decode_dir)

    else:
        raise Exception('mode flag was not evaluate or write.')
Example no. 9
def result(img):
    width = 700
    height = 700
    img = cv2.resize(img, (width, height))
    img_cont = img.copy()
    img_cp = img.copy()
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img_blur = cv2.GaussianBlur(img_gray, (5, 5), 1)
    img_edge = cv2.Canny(img_blur, 1, 30)

    contours, hierarchy = cv2.findContours(img_edge, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_NONE)
    cv2.drawContours(img_cont, contours, -1, (0, 255, 0), 5)

    r = util.rectContour(contours)

    rlist = []
    #drawing rectangular contours on img_cp
    for i in range(len(r)):
        rc = util.getCornerPoints(r[i])
        cv2.drawContours(img_cp, r[i], -1, (255, 0, 0), 5)
        rc = util.reorder(rc)
        rlist.append(rc)

    #11-14
    pts_1 = np.float32(rlist[0])
    pts_2 = np.float32([[0, 0], [width, 0], [0, height], [width, height]])
    matrix = cv2.getPerspectiveTransform(pts_1,
                                         pts_2)  # GET TRANSFORMATION MATRIX
    imgWarpColored_details = cv2.warpPerspective(img, matrix, (width, 1000))
    r0 = cv2.cvtColor(imgWarpColored_details, cv2.COLOR_BGR2GRAY)
    r01 = r0[124:700, 29:121]
    r01_thresh = cv2.threshold(r01, 100, 255, cv2.THRESH_BINARY_INV)[1]
    box_r01 = util.splitBoxes(r01_thresh, 4, 4)
    ans11_14 = util.getArray(4, 4, box_r01)

    #15-18
    r02 = r0[124:700, 171:263]
    r02_thresh = cv2.threshold(r02, 100, 255, cv2.THRESH_BINARY_INV)[1]
    box_r02 = util.splitBoxes(r02_thresh, 4, 4)
    ans15_18 = util.getArray(4, 4, box_r02)

    #19-22
    r03 = r0[124:700, 313:405]
    r03_thresh = cv2.threshold(r03, 100, 255, cv2.THRESH_BINARY_INV)[1]
    box_r03 = util.splitBoxes(r03_thresh, 4, 4)
    ans19_22 = util.getArray(4, 4, box_r03)

    #23-26
    r04 = r0[124:700, 455:547]
    r04_thresh = cv2.threshold(r04, 100, 255, cv2.THRESH_BINARY_INV)[1]
    box_r04 = util.splitBoxes(r04_thresh, 4, 4)
    ans23_26 = util.getArray(4, 4, box_r04)

    #27-30
    r05 = r0[124:700, 597:689]
    r05_thresh = cv2.threshold(r05, 100, 255, cv2.THRESH_BINARY_INV)[1]
    box_r05 = util.splitBoxes(r05_thresh, 4, 4)
    ans27_30 = util.getArray(4, 4, box_r05)

    #information
    pts_11 = np.float32(rlist[1])
    pts_21 = np.float32([[0, 0], [width, 0], [0, height], [width, height]])
    matrix = cv2.getPerspectiveTransform(pts_11,
                                         pts_21)  # GET TRANSFORMATION MATRIX
    imgWarpColored_details1 = cv2.warpPerspective(img, matrix, (width, 1000))
    r1 = cv2.cvtColor(imgWarpColored_details1, cv2.COLOR_BGR2GRAY)

    #enrollment_no
    r11 = r1[140:680, 50:400]
    r11_thresh = cv2.threshold(r11, 100, 255, cv2.THRESH_BINARY_INV)[1]
    box_r11 = util.splitBoxes(r11_thresh, 10, 10)
    enrollment_no = np.array(util.getArray(10, 10, box_r11))
    en = enrollment_no.T
    enl = []
    for i in range(10):
        enl.append(str((np.argmax(en[i]) + 1) % 10))
    eno = "".join(enl)

    #test_id
    r12 = r1[140:680, 500:690]
    r12_thresh = cv2.threshold(r12, 100, 255, cv2.THRESH_BINARY_INV)[1]
    box_r12 = util.splitBoxes(r12_thresh, 10, 5)
    test_id = np.array(util.getArray(10, 5, box_r12))
    tid = test_id.T
    til = []
    for i in range(5):
        til.append(str((np.argmax(tid[i]) + 1) % 10))
    tids = "".join(til)

    pts_13 = np.float32(rlist[2])
    pts_23 = np.float32([[0, 0], [width, 0], [0, height], [width, height]])
    matrix = cv2.getPerspectiveTransform(pts_13,
                                         pts_23)  # GET TRANSFORMATION MATRIX
    imgWarpColored_details2 = cv2.warpPerspective(img, matrix, (width, 1000))
    r2 = cv2.cvtColor(imgWarpColored_details2, cv2.COLOR_BGR2GRAY)

    #1-5
    r21 = r2[190:690, 120:320]
    r21_thresh = cv2.threshold(r21, 100, 255, cv2.THRESH_BINARY_INV)[1]
    box_r21 = util.splitBoxes(r21_thresh, 5, 4)
    ans1_5 = util.getArray(5, 4, box_r21)
    #print(ans1_5)

    #6-10
    r22 = r2[190:690, 420:620]
    r22_thresh = cv2.threshold(r22, 100, 255, cv2.THRESH_BINARY_INV)[1]
    box_r22 = util.splitBoxes(r22_thresh, 5, 4)
    ans6_10 = util.getArray(5, 4, box_r22)
    #tmp = np.array(ans6_10)
    #for i in range(5):
    #    print(np.argmax(tmp[i]))
    #print(ans6_10)

    a1 = np.array(ans1_5)
    a2 = np.array(ans6_10)
    a3 = np.array(ans11_14)
    a4 = np.array(ans15_18)
    a5 = np.array(ans19_22)
    a6 = np.array(ans23_26)
    a7 = np.array(ans27_30)

    options = ['a', 'b', 'c', 'd']
    answers = []
    for i in range(5):
        if (np.max(a1[i]) != 0):
            answers.append(options[np.argmax(a1[i])])
        else:
            answers.append('x')
    for i in range(5):
        if (np.max(a2[i]) != 0):
            answers.append(options[np.argmax(a2[i])])
        else:
            answers.append('x')

    for i in range(4):
        if (np.max(a3[i]) != 0):
            answers.append(options[np.argmax(a3[i])])
        else:
            answers.append('x')

    for i in range(4):
        if (np.max(a4[i]) != 0):
            answers.append(options[np.argmax(a4[i])])
        else:
            answers.append('x')

    for i in range(4):
        if (np.max(a5[i]) != 0):
            answers.append(options[np.argmax(a5[i])])
        else:
            answers.append('x')

    for i in range(4):
        if (np.max(a6[i]) != 0):
            answers.append(options[np.argmax(a6[i])])
        else:
            answers.append('x')

    for i in range(4):
        if (np.max(a7[i]) != 0):
            answers.append(options[np.argmax(a7[i])])
        else:
            answers.append('x')
    return eno, tids, answers
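The grading logic above relies on util.splitBoxes and util.getArray, which are not shown. From the call sites (splitBoxes(img, rows, cols) followed by getArray(rows, cols, boxes), with np.argmax later picking the marked option per row), they presumably cut a thresholded region into a rows x cols grid and count the marked pixels in each cell. A rough sketch under that assumption (not the actual util module):

import numpy as np

def split_boxes(thresh_img, rows, cols):
    # Hypothetical: split a thresholded region into rows x cols cells, in row-major order.
    boxes = []
    for band in np.array_split(thresh_img, rows, axis=0):
        boxes.extend(np.array_split(band, cols, axis=1))
    return boxes

def get_array(rows, cols, boxes):
    # Hypothetical: per-cell count of non-zero (marked) pixels, returned as a rows x cols nested list.
    counts = [int(np.count_nonzero(box)) for box in boxes]
    return [counts[r * cols:(r + 1) * cols] for r in range(rows)]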
    def decode_iteratively(self, example_generator, total, names_to_types,
                           ssi_list, hps):
        for example_idx, example in enumerate(
                tqdm(example_generator, total=total)):
            raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text = util.unpack_tf_example(
                example, names_to_types)
            article_sent_tokens = [
                util.process_sent(sent) for sent in raw_article_sents
            ]
            groundtruth_summ_sents = [[
                sent.strip()
                for sent in groundtruth_summary_text.strip().split('\n')
            ]]

            if ssi_list is None:  # this is if we are doing the upper bound evaluation (ssi_list comes straight from the groundtruth)
                sys_ssi = groundtruth_similar_source_indices_list
                if FLAGS.singles_and_pairs == 'singles':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 1)
                elif FLAGS.singles_and_pairs == 'both':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 2)
                sys_ssi = util.replace_empty_ssis(sys_ssi, raw_article_sents)
            else:
                gt_ssi, sys_ssi, ext_len = ssi_list[example_idx]
                if FLAGS.singles_and_pairs == 'singles':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 1)
                    groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                        groundtruth_similar_source_indices_list, 1)
                    gt_ssi = util.enforce_sentence_limit(gt_ssi, 1)
                elif FLAGS.singles_and_pairs == 'both':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 2)
                    groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                        groundtruth_similar_source_indices_list, 2)
                    gt_ssi = util.enforce_sentence_limit(gt_ssi, 2)
                if gt_ssi != groundtruth_similar_source_indices_list:
                    print('Warning: Example %d has different groundtruth source indices: %s || %s'
                          % (example_idx, groundtruth_similar_source_indices_list, gt_ssi))
                if FLAGS.dataset_name == 'xsum':
                    sys_ssi = [sys_ssi[0]]

            final_decoded_words = []
            final_decoded_output = ''
            best_hyps = []
            highlight_html_total = ''
            for ssi_idx, ssi in enumerate(sys_ssi):
                selected_raw_article_sents = util.reorder(
                    raw_article_sents, ssi)
                selected_article_text = ' '.join([
                    ' '.join(sent)
                    for sent in util.reorder(article_sent_tokens, ssi)
                ])
                selected_doc_indices_str = '0 ' * len(
                    selected_article_text.split())
                if FLAGS.upper_bound:
                    selected_groundtruth_summ_sent = [[
                        groundtruth_summ_sents[0][ssi_idx]
                    ]]
                else:
                    selected_groundtruth_summ_sent = groundtruth_summ_sents

                batch = create_batch(selected_article_text,
                                     selected_groundtruth_summ_sent,
                                     selected_doc_indices_str,
                                     selected_raw_article_sents,
                                     FLAGS.batch_size, hps, self._vocab)

                decoded_words, decoded_output, best_hyp = decode_example(
                    self._sess, self._model, self._vocab, batch, example_idx,
                    hps)
                best_hyps.append(best_hyp)
                final_decoded_words.extend(decoded_words)
                final_decoded_output += decoded_output

                if example_idx < 1000:
                    min_matched_tokens = 2
                    selected_article_sent_tokens = [
                        util.process_sent(sent)
                        for sent in selected_raw_article_sents
                    ]
                    highlight_summary_sent_tokens = [decoded_words]
                    highlight_ssi_list, lcs_paths_list, highlight_smooth_article_lcs_paths_list = ssi_functions.get_simple_source_indices_list(
                        highlight_summary_sent_tokens,
                        selected_article_sent_tokens, None, 2,
                        min_matched_tokens)
                    highlighted_html = ssi_functions.html_highlight_sents_in_article(
                        highlight_summary_sent_tokens,
                        highlight_ssi_list,
                        selected_article_sent_tokens,
                        lcs_paths_list=lcs_paths_list,
                        article_lcs_paths_list=highlight_smooth_article_lcs_paths_list)
                    highlight_html_total += '<u>System Summary</u><br><br>' + highlighted_html + '<br><br>'

                if len(final_decoded_words) >= 100:
                    break

            if example_idx < 1000:
                self.write_for_human(raw_article_sents, groundtruth_summ_sents,
                                     final_decoded_words, example_idx)
                ssi_functions.write_highlighted_html(highlight_html_total,
                                                     self._highlight_dir,
                                                     example_idx)

            rouge_functions.write_for_rouge(
                groundtruth_summ_sents,
                None,
                example_idx,
                self._rouge_ref_dir,
                self._rouge_dec_dir,
                decoded_words=final_decoded_words,
                log=False
            )  # write ref summary and decoded summary to file, to eval with pyrouge later
            example_idx += 1  # this is how many examples we've decoded

        logging.info("Decoder has finished reading dataset for single_pass.")
        logging.info("Output has been saved in %s and %s.",
                     self._rouge_ref_dir, self._rouge_dec_dir)
        if len(os.listdir(self._rouge_ref_dir)) != 0:
            l_param = 100
            logging.info("Now starting ROUGE eval...")
            results_dict = rouge_functions.rouge_eval(self._rouge_ref_dir,
                                                      self._rouge_dec_dir,
                                                      l_param=l_param)
            rouge_functions.rouge_log(results_dict, self._decode_dir)
Example no. 11
while True:
    if webCam: success, img = cap.read()
    else: img = cv.imread(path)
    img = cv.resize(img, (0, 0), None, 0.5, 0.5)
    img, finalContours = util.genricGetContours(img, minArea=50000, filterr=4)
    if len(finalContours) != 0:
        biggest = finalContours[0][2]
        imgWarp = util.wrapImage(img, biggest, wP, hP, 40)
        img2, Contours2 = util.genricGetContours(imgWarp,
                                                 minArea=1000,
                                                 filterr=4,
                                                 cThr=[50, 50])
        if len(Contours2) != 0:
            for obj in Contours2:
                #cv.polylines(img2,[obj[2]],True,(0,255,255),2)
                nPoints = util.reorder(obj[2])
                newWidth = round((util.findDistance(
                    nPoints[0][0] // scale, nPoints[1][0] // scale) / 10), 1)
                newHeight = round((util.findDistance(
                    nPoints[0][0] // scale, nPoints[2][0] // scale) / 10), 1)
                cv.arrowedLine(img2, (nPoints[0][0][0], nPoints[0][0][1]),
                               (nPoints[1][0][0], nPoints[1][0][1]),
                               (255, 0, 255), 3, 8, 0, 0.05)
                cv.arrowedLine(img2, (nPoints[0][0][0], nPoints[0][0][1]),
                               (nPoints[2][0][0], nPoints[2][0][1]),
                               (255, 0, 255), 3, 8, 0, 0.05)
                x, y, w, h = obj[3]
                cv.putText(img2, '{}cm'.format(newWidth), (x + 30, y - 10),
                           cv.FONT_HERSHEY_COMPLEX, 2, (255, 0, 255), 2)
                cv.putText(img2, '{}cm'.format(newHeight),
                           (x - 70, y + h // 2), cv.FONT_HERSHEY_COMPLEX, 2,
                           (255, 0, 255), 2)
def evaluate_example(ex):
    example, example_idx, qid_ssi_to_importances, qid_ssi_to_token_scores_and_mappings = ex
    print(example_idx)
    # example_idx += 1
    qid = example_idx
    raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
        example, names_to_types)
    article_sent_tokens = [
        util.process_sent(sent) for sent in raw_article_sents
    ]
    enforced_groundtruth_ssi_list = util.enforce_sentence_limit(
        groundtruth_similar_source_indices_list, sentence_limit)
    groundtruth_summ_sent_tokens = []
    groundtruth_summ_sents = [[
        sent.strip() for sent in groundtruth_summary_text.strip().split('\n')
    ]]
    groundtruth_summ_sent_tokens = [
        sent.split(' ') for sent in groundtruth_summ_sents[0]
    ]

    if FLAGS.upper_bound:
        replaced_ssi_list = util.replace_empty_ssis(
            enforced_groundtruth_ssi_list, raw_article_sents)
        selected_article_sent_indices = util.flatten_list_of_lists(
            replaced_ssi_list)
        summary_sents = [
            ' '.join(sent) for sent in util.reorder(
                article_sent_tokens, selected_article_sent_indices)
        ]
        similar_source_indices_list = groundtruth_similar_source_indices_list
        ssi_length_extractive = len(similar_source_indices_list)
    else:
        summary_sents, similar_source_indices_list, summary_sents_for_html, ssi_length_extractive, \
            article_lcs_paths_list, token_probs_list = generate_summary(article_sent_tokens, qid_ssi_to_importances, example_idx, qid_ssi_to_token_scores_and_mappings)
        similar_source_indices_list_trunc = similar_source_indices_list[:ssi_length_extractive]
        summary_sents_for_html_trunc = summary_sents_for_html[:ssi_length_extractive]
        if example_idx < 100 or (example_idx >= 2000 and example_idx < 2100):
            summary_sent_tokens = [
                sent.split(' ') for sent in summary_sents_for_html_trunc
            ]
            if FLAGS.tag_tokens and FLAGS.tag_loss_wt != 0:
                lcs_paths_list_param = copy.deepcopy(article_lcs_paths_list)
            else:
                lcs_paths_list_param = None
            extracted_sents_in_article_html = html_highlight_sents_in_article(
                summary_sent_tokens,
                similar_source_indices_list_trunc,
                article_sent_tokens,
                doc_indices=doc_indices,
                lcs_paths_list=lcs_paths_list_param)
            # write_highlighted_html(extracted_sents_in_article_html, html_dir, example_idx)

            groundtruth_ssi_list, gt_lcs_paths_list, gt_article_lcs_paths_list, gt_smooth_article_paths_list = get_simple_source_indices_list(
                groundtruth_summ_sent_tokens, article_sent_tokens, None,
                sentence_limit, min_matched_tokens)
            groundtruth_highlighted_html = html_highlight_sents_in_article(
                groundtruth_summ_sent_tokens,
                groundtruth_ssi_list,
                article_sent_tokens,
                lcs_paths_list=gt_lcs_paths_list,
                article_lcs_paths_list=gt_smooth_article_paths_list,
                doc_indices=doc_indices)

            all_html = '<u>System Summary</u><br><br>' + extracted_sents_in_article_html + '<u>Groundtruth Summary</u><br><br>' + groundtruth_highlighted_html
            # all_html = '<u>System Summary</u><br><br>' + extracted_sents_in_article_html
            write_highlighted_html(all_html, html_dir, example_idx)
    rouge_functions.write_for_rouge(groundtruth_summ_sents, summary_sents,
                                    example_idx, ref_dir, dec_dir)
    return (groundtruth_similar_source_indices_list,
            similar_source_indices_list, ssi_length_extractive,
            token_probs_list)
def evaluate_example(ex):
    example, example_idx, qid_ssi_to_importances, _, _ = ex
    print(example_idx)
    # example_idx += 1
    qid = example_idx
    raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
        example, names_to_types)
    article_sent_tokens = [
        util.process_sent(sent) for sent in raw_article_sents
    ]
    enforced_groundtruth_ssi_list = util.enforce_sentence_limit(
        groundtruth_similar_source_indices_list, sentence_limit)
    if FLAGS.dataset_name == 'duc_2004':
        groundtruth_summ_sents = [[
            sent.strip() for sent in gt_summ_text.strip().split('\n')
        ] for gt_summ_text in groundtruth_summary_text]
    else:
        groundtruth_summ_sents = [[
            sent.strip()
            for sent in groundtruth_summary_text.strip().split('\n')
        ]]
    groundtruth_summ_sent_tokens = [
        sent.split(' ') for sent in groundtruth_summ_sents[0]
    ]

    if FLAGS.upper_bound:
        replaced_ssi_list = util.replace_empty_ssis(
            enforced_groundtruth_ssi_list, raw_article_sents)
        selected_article_sent_indices = util.flatten_list_of_lists(
            replaced_ssi_list)
        summary_sents = [
            ' '.join(sent) for sent in util.reorder(
                article_sent_tokens, selected_article_sent_indices)
        ]
        similar_source_indices_list = groundtruth_similar_source_indices_list
        ssi_length_extractive = len(similar_source_indices_list)
    elif FLAGS.lead:
        lead_ssi_list = [(idx,) for idx in range(util.average_sents_for_dataset[FLAGS.dataset_name])]
        # make sure the sentence indices don't go past the total number of sentences in the article
        lead_ssi_list = lead_ssi_list[:len(raw_article_sents)]
        selected_article_sent_indices = util.flatten_list_of_lists(
            lead_ssi_list)
        summary_sents = [
            ' '.join(sent) for sent in util.reorder(
                article_sent_tokens, selected_article_sent_indices)
        ]
        similar_source_indices_list = lead_ssi_list
        ssi_length_extractive = len(similar_source_indices_list)
    else:
        summary_sents, similar_source_indices_list, summary_sents_for_html, ssi_length_extractive = generate_summary(
            article_sent_tokens, qid_ssi_to_importances, example_idx)
        similar_source_indices_list_trunc = similar_source_indices_list[:ssi_length_extractive]
        summary_sents_for_html_trunc = summary_sents_for_html[:ssi_length_extractive]
        if example_idx <= 100:
            summary_sent_tokens = [
                sent.split(' ') for sent in summary_sents_for_html_trunc
            ]
            extracted_sents_in_article_html = html_highlight_sents_in_article(
                summary_sent_tokens,
                similar_source_indices_list_trunc,
                article_sent_tokens,
                doc_indices=doc_indices)
            # write_highlighted_html(extracted_sents_in_article_html, html_dir, example_idx)

            groundtruth_ssi_list, lcs_paths_list, article_lcs_paths_list = get_simple_source_indices_list(
                groundtruth_summ_sent_tokens, article_sent_tokens, None,
                sentence_limit, min_matched_tokens)
            groundtruth_highlighted_html = html_highlight_sents_in_article(
                groundtruth_summ_sent_tokens,
                groundtruth_ssi_list,
                article_sent_tokens,
                lcs_paths_list=lcs_paths_list,
                article_lcs_paths_list=article_lcs_paths_list,
                doc_indices=doc_indices)
            all_html = '<u>System Summary</u><br><br>' + extracted_sents_in_article_html + '<u>Groundtruth Summary</u><br><br>' + groundtruth_highlighted_html
            write_highlighted_html(all_html, html_dir, example_idx)
    rouge_functions.write_for_rouge(groundtruth_summ_sents, summary_sents,
                                    example_idx, ref_dir, dec_dir)
    return (groundtruth_similar_source_indices_list,
            similar_source_indices_list, ssi_length_extractive)
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 15 01:03:18 2017

@author: ujjaldas223
"""

from util import reorder

print reorder('What were you doing yesterday')
print reorder('When are you going to school')
print reorder('I am going to school')
Example no. 15
def surrounding_points(point: tuple, delta: float, data: list,
                       reordered_data: list) -> [[]]:
    """ Given a point, a delta, and the set of points to work with, return
    a list of tuples containing all surrounding points.
    This should work for both bases and demands. """
    x = point[0]
    y = point[1]

    index_of_center = 0
    count_found = 0
    print_found = True
    # print (point)

    # let's find the surrounding x coordinates
    if reordered_data:
        x_ordered = reordered_data
    else:
        x_ordered = util.reorder(data, 0)

    for i in range(len(x_ordered)):
        if x_ordered[i][0] == x:
            index_of_center = i
            break

    if not index_of_center:
        # print(x_ordered)
        while x_ordered[index_of_center][0] < x:
            if index_of_center == len(x_ordered) - 1: break
            # print(index_of_center,x_ordered[index_of_center][0],x )
            index_of_center += 1
        #print(index_of_center)
        i = index_of_center

    # Go forwards and backwards. For each point, calculate the distance.
    # If the distance exceeds the delta, then stop
    curr_x = index_of_center + 1
    all_the_surrounding_points = []
    while True:

        # Check that only the x diff > delta in x
        if curr_x < 0 or curr_x >= len(x_ordered):
            break
        # Check if the x_edge is delta away
        if util.dist(x_ordered[i],
                     (x_ordered[curr_x][0], x_ordered[i][1])) > delta:
            break

        distance = util.dist((x, y), x_ordered[curr_x])
        if distance < delta:

            if print_found:
                print("Found: ", (x, y), x_ordered[curr_x], "\tDistance:\t",
                      distance)
            count_found += 1
            all_the_surrounding_points.append(x_ordered[curr_x])

        curr_x += 1

    curr_x = index_of_center - 1
    #Identical loop, but going the other direction
    while True:
        if curr_x < 0 or curr_x >= len(x_ordered):
            break
        if util.dist(x_ordered[i],
                     (x_ordered[curr_x][0], x_ordered[i][1])) > delta:
            break
        distance = util.dist(x_ordered[i], x_ordered[curr_x])

        if distance < delta:
            if print_found:
                print("Found: ", x_ordered[i], x_ordered[curr_x],
                      "\tDistance:\t", distance)
            count_found += 1
            all_the_surrounding_points.append(x_ordered[curr_x])
        curr_x -= 1

    print(count_found, (x, y), all_the_surrounding_points)
    print("\n")

    return all_the_surrounding_points
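Note that util.reorder(data, 0) is used differently here than in the summarization snippets: the code that follows walks x_ordered assuming the points are sorted by their x coordinate (column 0). Under that assumption (the util module is not shown), an equivalent expression would be:

x_ordered = sorted(data, key=lambda pt: pt[0])  # hypothetical equivalent: sort points by x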
def main():
    #============================================================================
    #--1.Preparation for adaboost learning framework with RF as base learner.
    maxnum_iters = 10
    filter_idx_set = set([11, 14, 64, 16])
    paths = json.loads(open("SETTINGS.json").read())

    print("Getting features for deleted papers from the disk files")
    features_conf = [feature for feature in \
            csv.reader(open(paths["trainpos_features"]))]
    features_deleted = [feature for feature in \
            csv.reader(open(paths["trainneg_features"]))]
    train_author_confirmed = get_train_confirmed([pair[:2] for pair in features_conf])
    train_features = [map(float, x[2:]) for x in features_deleted + features_conf]
    train_author_paper_ids = [x[:2] for x in features_deleted + features_conf]
    train_features = feature_selection(train_features, filter_idx_set)
    train_target = [0 for x in range(len(features_deleted))] + \
            [1 for x in range(len(features_conf))]
    train_labels = np.array([-1 for x in range(len(features_deleted))] + \
            [1 for x in range(len(features_conf))])

    features_valid = [feature for feature in \
            csv.reader(open(paths["vali_features"]))]
    test_features = [map(float, x[2:]) for x in features_valid]
    test_features = feature_selection(test_features, filter_idx_set)
    test_author_confirmed = get_confirmed_paper(paths["vali_solution"])
    test_author_paper_ids = [x[:2] for x in features_valid]
    #============================================================================

    #============================================================================
    #--2.Start of adaboost learning framework
    # initialization importance distribution of data points
    trdata_importance = np.array([1.0/len(train_target) for i in\
            range(len(train_target))])
    model_weights = np.array([0.0 for i in range(maxnum_iters)])
    classifier_set = []

    #############################################################################
    #--Hyperparameter tuning: the best number of base learners
    max_map_val = 0.0
    bestnum_baselearner_map = 0
    #############################################################################
    print("Start adaboost learning loops.")
    for i in range(maxnum_iters):
        classifier = RandomForestClassifier(n_estimators=50,
                                            verbose=2,
                                            n_jobs=4,
                                            min_samples_split=10,
                                            random_state=1)
        if i == 0:
            classifier.fit(train_features, train_target)
        else:
            classifier.fit(train_features, train_target, trdata_importance)
        train_predictions = classifier.predict_proba(train_features)[:,1]
        classifier_set.append(classifier)

        ########################################################################
        #--Hyperparameter tuning: the best number of base learners
        # the first method to calculate error rate: absolute value difference
        error_rate = np.dot(trdata_importance,\
                np.abs(train_target-train_predictions))
        # the second method: negative MAP value
        '''author_predictions = defaultdict(list)
        paper_predictions = {}
        for (a_id, p_id), pred, label in zip(train_author_paper_ids,\
                train_predictions, train_target):
            author_predictions[int(a_id)].append([pred, int(p_id), label])

        for author_id in sorted(author_predictions):
            author_predictions[author_id] = reorder(author_predictions[author_id])
            paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]
        map_val = calcMAP(paper_predictions, train_author_confirmed)
        error_rate = 1 - map_val'''
        # the third method: approximate misclassification error
        '''delta = 0.05
        error_rate = np.dot(trdata_importance,\
                np.abs(train_target-train_predictions)>delta)'''
        ########################################################################

        print "error rate: %f" % error_rate
        model_weights[i] = 1.0/5*np.log((1.0-error_rate)/error_rate)
        print model_weights[i]
        raw_input()  # pauses until Enter is pressed (likely a manual debugging checkpoint)
        #model_weights = model_weights / np.sum(model_weights)
        conv_predictions = np.array([(pred-0.5)*2 for pred in train_predictions])
        #for j in range(len(conv_predictions)):
        #    if conv_predictions[j] > 0:
        #        conv_predictions[j] = 1
        #    else:
        #        conv_predictions[j] = -1
        trdata_importance = trdata_importance*np.exp(-model_weights[i]*\
                train_labels*conv_predictions)
        trdata_importance = trdata_importance/np.sum(trdata_importance)

        ########################################################################
        #--Hyperparameter tuning: the best number of base learners
        test_predictions = np.array([0.0 for j in range(len(test_features))])  # accumulator over the test set
        for j in range(i+1):
            test_predictions = [pred1+pred2 for pred1,pred2 in\
                    zip(test_predictions, model_weights[j]*\
                    classifier_set[j].predict_proba(test_features)[:,1])]

        author_predictions = defaultdict(list)
        paper_predictions = {}
        for (a_id, p_id), pred in zip(test_author_paper_ids, test_predictions):
            if p_id in test_author_confirmed[int(a_id)]:
                author_predictions[int(a_id)].append([pred, int(p_id), 1])
            else:
                author_predictions[int(a_id)].append([pred, int(p_id), 0])

        for author_id in sorted(author_predictions):
            author_predictions[author_id] = reorder(author_predictions[author_id])
            paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

        print("Test the prediction results with MAP metric.")
        map_val = calcMAP(paper_predictions, test_author_confirmed)
        print "Iteration #%d: MAP value --> %f" % (i+1, map_val)
        if map_val > max_map_val:
            max_map_val = map_val
            bestnum_baselearner_map = i+1
        print "Best MAP value --> %f, best number of learners --> %d\n"\
                % (max_map_val, bestnum_baselearner_map)
        #raw_input()
        ########################################################################

    model_weights = model_weights / np.sum(model_weights)
    #============================================================================

    #============================================================================
    #--3.Prediction results on test data
    test_predictions = np.dot(model_weights,\
            np.array([classifier.predict_proba(test_features)[:,1] for\
            classifier in classifier_set]))

    author_predictions = defaultdict(list)
    paper_predictions = {}
    for (a_id, p_id), pred in zip(test_author_paper_ids, test_predictions):
        if p_id in test_author_confirmed[int(a_id)]:
            author_predictions[int(a_id)].append([pred, int(p_id), 1])
        else:
            author_predictions[int(a_id)].append([pred, int(p_id), 0])

    for author_id in sorted(author_predictions):
        author_predictions[author_id] = reorder(author_predictions[author_id])
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Test the prediction results with MAP metric.")
    map_val = calcMAP(paper_predictions, test_author_confirmed)
    print("Final MAP value: %f" % map_val)
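One detail worth flagging in the boosting loop above: the textbook discrete AdaBoost model weight is alpha = 0.5 * ln((1 - error_rate) / error_rate), whereas this code scales by 1/5 instead of 1/2, which damps how aggressively each round reweights the training points. For reference only (the 1/5 factor above is the author's choice and is left as-is):

import numpy as np

def adaboost_alpha(error_rate, scale=0.5):
    # Textbook AdaBoost uses scale = 0.5; the loop above effectively uses scale = 0.2.
    return scale * np.log((1.0 - error_rate) / error_rate)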
Example no. 17
    def fill_example_queue(self):
        """Reads data from file and processes into Examples which are then placed into the example queue."""

        if self._example_generator is None:
            input_gen = self.text_generator(
                data.example_generator(self._data_path,
                                       self._single_pass,
                                       self._cnn_500_dm_500,
                                       is_original=('with_coref'
                                                    not in self._data_path)))
        else:
            input_gen = self.text_generator(self._example_generator)
        if self._hps.pg_mmr and self._hps.ssi_data_path != '':  # if use pg_mmr and bert
            print(util.bcolors.OKGREEN + "Loading SSI from BERT at %s" %
                  os.path.join(self._hps.ssi_data_path, 'ssi.pkl') +
                  util.bcolors.ENDC)
            with open(os.path.join(self._hps.ssi_data_path, 'ssi.pkl'), 'rb') as f:
                ssi_triple_list = pickle.load(f)
                # ssi_list = [ssi_triple[1] for ssi_triple in ssi_triple_list]
        else:
            ssi_triple_list = None
        counter = 0
        while True:
            try:
                (
                    article, abstracts, doc_indices_str, raw_article_sents,
                    ssi, article_lcs_paths_list
                ) = next(
                    input_gen
                )  # read the next example from file. article and abstract are both strings.
            except StopIteration:  # if there are no more examples:
                logging.info(
                    "The example generator for this example queue filling thread has exhausted data."
                )
                if self._single_pass:
                    logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping."
                    )
                    self._finished_reading = True
                    if ssi_triple_list is not None and counter < len(
                            ssi_triple_list):
                        raise Exception(
                            'Len of ssi list (%d) is greater than number of examples (%d)'
                            % (len(ssi_triple_list), counter))
                    break
                else:
                    raise Exception(
                        "single_pass mode is off but the example generator is out of data; error."
                    )
            if ssi_triple_list is not None:
                if counter >= len(ssi_triple_list):
                    raise Exception(
                        'Len of ssi list (%d) is less than number of examples (>=%d)'
                        % (len(ssi_triple_list), counter))
                ssi_length_extractive = ssi_triple_list[counter][2]
                ssi = ssi_triple_list[counter][1]
                ssi = ssi[:ssi_length_extractive]

            article = article
            abstracts = [abstract for abstract in abstracts]
            if type(doc_indices_str) != str:
                doc_indices_str = doc_indices_str
            raw_article_sents = [sent for sent in raw_article_sents]

            all_abstract_sentences = [[
                sent.strip() for sent in data.abstract2sents(abstract)
            ] for abstract in abstracts]
            if len(all_abstract_sentences) != 0:
                abstract_sentences = all_abstract_sentences[0]
            else:
                abstract_sentences = []
            doc_indices = [int(idx) for idx in doc_indices_str.strip().split()]
            # join_separator = ' [SEP] ' if self._hps.sep else ' '
            if self._hps.by_instance:  # if we are running iteratively on only instances (a singleton/pair + a summary sentence), not the whole article
                for abs_idx, abstract_sentence in enumerate(
                        abstract_sentences):
                    inst_ssi = ssi[abs_idx]
                    if len(inst_ssi) == 0:
                        continue
                    inst_abstract_sentences = abstract_sentence
                    inst_raw_article_sents = util.reorder(
                        raw_article_sents, inst_ssi)
                    inst_article = ' '.join([
                        ' '.join(util.process_sent(sent, whitespace=True))
                        for sent in inst_raw_article_sents
                    ])
                    inst_doc_indices = [0] * len(inst_article.split())
                    inst_article_lcs_paths_list = article_lcs_paths_list[
                        abs_idx]

                    if len(
                            inst_article
                    ) == 0:  # See https://github.com/abisee/pointer-generator/issues/1
                        logging.warning(
                            'Found an example with empty article text. Skipping it.\n*********************************************'
                        )
                    elif len(inst_article.strip().split()
                             ) < 3 and self._hps.skip_with_less_than_3:
                        print(
                            'Article has less than 3 tokens, so skipping\n*********************************************'
                        )
                    elif len(inst_abstract_sentences.strip().split()
                             ) < 3 and self._hps.skip_with_less_than_3:
                        print(
                            'Abstract has less than 3 tokens, so skipping\n*********************************************'
                        )
                    else:
                        inst_example = Example(None, [inst_abstract_sentences],
                                               all_abstract_sentences, None,
                                               inst_raw_article_sents, None,
                                               [inst_article_lcs_paths_list],
                                               self._vocab, self._hps)
                        self._example_queue.put(inst_example)
            else:
                example = Example(None, abstract_sentences,
                                  all_abstract_sentences, None,
                                  raw_article_sents, ssi,
                                  article_lcs_paths_list, self._vocab,
                                  self._hps)  # Process into an Example.
                self._example_queue.put(
                    example)  # place the Example in the example queue.

            # print "example num", counter
            counter += 1
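util.reorder is the helper this page collects examples for, but its body is not part of the snippet above. In this summarization code it appears to simply pull items out by index; a minimal sketch under that assumption:

def reorder(items, indices):
    # Hypothetical sketch: return the elements of `items` at `indices`, in order,
    # e.g. reorder(raw_article_sents, inst_ssi) selects the source sentences of one SSI.
    return [items[i] for i in indices]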
"""
Created on Tue Nov 21 14:20:52 2017

@author: ayesha
"""
from util import reorder
import json
from past import get_verb as get_past_verb
from future import get_verb as get_future_verb
from present_tense import get_verb as get_present_verb
from bibhakti import bibhakti
from phonetics import phonetics
import re
inpt = 'I am going to school with rama'
#print inpt
l, sen_tag = reorder(inpt)
#print l
#print 222
output = [l[0]]
for item in l:
    if re.search(r'(?:VB|MD)', item[0]):
        output.append(item)

#tense = {'VBD', 'MD', 'VBP' or 'VBZ'}
#print 111
#print output
for item in output:
    if item[0] == 'VBD':
        p, c = get_past_verb(output)
        break
    elif item[0] == 'MD':
Example no. 19
0
    def decode_iteratively(self, example_generator, total, names_to_types,
                           ssi_list, hps):
        attn_vis_idx = 0
        for example_idx, example in enumerate(
                tqdm(example_generator, total=total)):
            raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, groundtruth_article_lcs_paths_list = util.unpack_tf_example(
                example, names_to_types)
            article_sent_tokens = [
                util.process_sent(sent) for sent in raw_article_sents
            ]
            groundtruth_summ_sents = [[
                sent.strip()
                for sent in groundtruth_summary_text.strip().split('\n')
            ]]
            groundtruth_summ_sent_tokens = [
                sent.split(' ') for sent in groundtruth_summ_sents[0]
            ]

            if ssi_list is None:  # this is if we are doing the upper bound evaluation (ssi_list comes straight from the groundtruth)
                sys_ssi = groundtruth_similar_source_indices_list
                sys_alp_list = groundtruth_article_lcs_paths_list
                if FLAGS.singles_and_pairs == 'singles':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 1)
                    sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 1)
                elif FLAGS.singles_and_pairs == 'both':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 2)
                    sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 2)
                sys_ssi, sys_alp_list = util.replace_empty_ssis(
                    sys_ssi, raw_article_sents, sys_alp_list=sys_alp_list)
            else:
                gt_ssi, sys_ssi, ext_len, sys_token_probs_list = ssi_list[
                    example_idx]
                sys_alp_list = ssi_functions.list_labels_from_probs(
                    sys_token_probs_list, FLAGS.tag_threshold)
                if FLAGS.singles_and_pairs == 'singles':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 1)
                    sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 1)
                    groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                        groundtruth_similar_source_indices_list, 1)
                    gt_ssi = util.enforce_sentence_limit(gt_ssi, 1)
                elif FLAGS.singles_and_pairs == 'both':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 2)
                    sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 2)
                    groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                        groundtruth_similar_source_indices_list, 2)
                    gt_ssi = util.enforce_sentence_limit(gt_ssi, 2)
                # if gt_ssi != groundtruth_similar_source_indices_list:
                #     raise Exception('Example %d has different groundtruth source indices: ' + str(groundtruth_similar_source_indices_list) + ' || ' + str(gt_ssi))
                if FLAGS.dataset_name == 'xsum':
                    sys_ssi = [sys_ssi[0]]

            final_decoded_words = []
            final_decoded_outputs = ''
            best_hyps = []
            highlight_html_total = '<u>System Summary</u><br><br>'
            for ssi_idx, ssi in enumerate(sys_ssi):
                # selected_article_lcs_paths = None
                selected_article_lcs_paths = sys_alp_list[ssi_idx]
                ssi, selected_article_lcs_paths = util.make_ssi_chronological(
                    ssi, selected_article_lcs_paths)
                selected_article_lcs_paths = [selected_article_lcs_paths]
                selected_raw_article_sents = util.reorder(
                    raw_article_sents, ssi)
                selected_article_text = ' '.join([
                    ' '.join(sent)
                    for sent in util.reorder(article_sent_tokens, ssi)
                ])
                selected_doc_indices_str = '0 ' * len(
                    selected_article_text.split())
                if FLAGS.upper_bound:
                    selected_groundtruth_summ_sent = [[
                        groundtruth_summ_sents[0][ssi_idx]
                    ]]
                else:
                    selected_groundtruth_summ_sent = groundtruth_summ_sents

                batch = create_batch(selected_article_text,
                                     selected_groundtruth_summ_sent,
                                     selected_doc_indices_str,
                                     selected_raw_article_sents,
                                     selected_article_lcs_paths,
                                     FLAGS.batch_size, hps, self._vocab)

                original_article = batch.original_articles[0]  # string
                original_abstract = batch.original_abstracts[0]  # string
                article_withunks = data.show_art_oovs(original_article,
                                                      self._vocab)  # string
                abstract_withunks = data.show_abs_oovs(
                    original_abstract, self._vocab,
                    (batch.art_oovs[0]
                     if FLAGS.pointer_gen else None))  # string
                # article_withunks = data.show_art_oovs(original_article, self._vocab) # string
                # abstract_withunks = data.show_abs_oovs(original_abstract, self._vocab, (batch.art_oovs[0] if FLAGS.pointer_gen else None)) # string

                if FLAGS.first_intact and ssi_idx == 0:
                    decoded_words = selected_article_text.strip().split()
                    decoded_output = selected_article_text
                else:
                    decoded_words, decoded_output, best_hyp = decode_example(
                        self._sess, self._model, self._vocab, batch,
                        example_idx, hps)
                    best_hyps.append(best_hyp)
                final_decoded_words.extend(decoded_words)
                final_decoded_outputs += decoded_output

                if example_idx < 100 or (example_idx >= 2000
                                         and example_idx < 2100):
                    min_matched_tokens = 2
                    selected_article_sent_tokens = [
                        util.process_sent(sent)
                        for sent in selected_raw_article_sents
                    ]
                    highlight_summary_sent_tokens = [decoded_words]
                    highlight_ssi_list, lcs_paths_list, highlight_article_lcs_paths_list, highlight_smooth_article_lcs_paths_list = ssi_functions.get_simple_source_indices_list(
                        highlight_summary_sent_tokens,
                        selected_article_sent_tokens, None, 2,
                        min_matched_tokens)
                    highlighted_html = ssi_functions.html_highlight_sents_in_article(
                        highlight_summary_sent_tokens,
                        highlight_ssi_list,
                        selected_article_sent_tokens,
                        lcs_paths_list=lcs_paths_list,
                        article_lcs_paths_list=
                        highlight_smooth_article_lcs_paths_list)
                    highlight_html_total += highlighted_html + '<br>'

                if FLAGS.attn_vis and example_idx < 200:
                    self.write_for_attnvis(
                        article_withunks, abstract_withunks, decoded_words,
                        best_hyp.attn_dists, best_hyp.p_gens, attn_vis_idx
                    )  # write info to .json file for visualization tool
                    attn_vis_idx += 1

                if len(final_decoded_words) >= 100:
                    break

            gt_ssi_list, gt_alp_list = util.replace_empty_ssis(
                groundtruth_similar_source_indices_list,
                raw_article_sents,
                sys_alp_list=groundtruth_article_lcs_paths_list)
            highlight_html_gt = '<u>Reference Summary</u><br><br>'
            for ssi_idx, ssi in enumerate(gt_ssi_list):
                selected_article_lcs_paths = gt_alp_list[ssi_idx]
                try:
                    ssi, selected_article_lcs_paths = util.make_ssi_chronological(
                        ssi, selected_article_lcs_paths)
                except:
                    util.print_vars(ssi, example_idx,
                                    selected_article_lcs_paths)
                    raise
                selected_raw_article_sents = util.reorder(
                    raw_article_sents, ssi)

                if example_idx < 100 or (example_idx >= 2000
                                         and example_idx < 2100):
                    min_matched_tokens = 2
                    selected_article_sent_tokens = [
                        util.process_sent(sent)
                        for sent in selected_raw_article_sents
                    ]
                    highlight_summary_sent_tokens = [
                        groundtruth_summ_sent_tokens[ssi_idx]
                    ]
                    highlight_ssi_list, lcs_paths_list, highlight_article_lcs_paths_list, highlight_smooth_article_lcs_paths_list = ssi_functions.get_simple_source_indices_list(
                        highlight_summary_sent_tokens,
                        selected_article_sent_tokens, None, 2,
                        min_matched_tokens)
                    highlighted_html = ssi_functions.html_highlight_sents_in_article(
                        highlight_summary_sent_tokens,
                        highlight_ssi_list,
                        selected_article_sent_tokens,
                        lcs_paths_list=lcs_paths_list,
                        article_lcs_paths_list=
                        highlight_smooth_article_lcs_paths_list)
                    highlight_html_gt += highlighted_html + '<br>'

            if example_idx < 100 or (example_idx >= 2000
                                     and example_idx < 2100):
                self.write_for_human(raw_article_sents, groundtruth_summ_sents,
                                     final_decoded_words, example_idx)
                highlight_html_total = ssi_functions.put_html_in_two_columns(
                    highlight_html_total, highlight_html_gt)
                ssi_functions.write_highlighted_html(highlight_html_total,
                                                     self._highlight_dir,
                                                     example_idx)

            # if example_idx % 100 == 0:
            #     attn_dir = os.path.join(self._decode_dir, 'attn_vis_data')
            #     attn_selections.process_attn_selections(attn_dir, self._decode_dir, self._vocab)

            rouge_functions.write_for_rouge(
                groundtruth_summ_sents,
                None,
                example_idx,
                self._rouge_ref_dir,
                self._rouge_dec_dir,
                decoded_words=final_decoded_words,
                log=False
            )  # write ref summary and decoded summary to file, to eval with pyrouge later
            # if FLAGS.attn_vis:
            #     self.write_for_attnvis(article_withunks, abstract_withunks, decoded_words, best_hyp.attn_dists, best_hyp.p_gens, example_idx) # write info to .json file for visualization tool
            example_idx += 1  # this is how many examples we've decoded

        logging.info("Decoder has finished reading dataset for single_pass.")
        logging.info("Output has been saved in %s and %s.",
                     self._rouge_ref_dir, self._rouge_dec_dir)
        if len(os.listdir(self._rouge_ref_dir)) != 0:
            if FLAGS.dataset_name == 'xsum':
                l_param = 100
            else:
                l_param = 100
            logging.info("Now starting ROUGE eval...")
            results_dict = rouge_functions.rouge_eval(self._rouge_ref_dir,
                                                      self._rouge_dec_dir,
                                                      l_param=l_param)
            rouge_functions.rouge_log(results_dict, self._decode_dir)
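util.make_ssi_chronological, called above for both the system and reference SSIs, is not shown in this excerpt. A minimal sketch of what it presumably does (the _sketch suffix and the list return types are assumptions):

def make_ssi_chronological_sketch(ssi, article_lcs_paths):
    # Sort the selected source-sentence indices into article order and keep the
    # per-sentence token paths aligned with them.
    order = sorted(range(len(ssi)), key=lambda k: ssi[k])
    return [ssi[k] for k in order], [article_lcs_paths[k] for k in order]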
def generate_summary(article_sent_tokens, qid_ssi_to_importances, example_idx,
                     qid_ssi_to_token_scores_and_mappings):
    qid = example_idx

    summary_sent_tokens = []
    summary_tokens = util.flatten_list_of_lists(summary_sent_tokens)
    already_used_source_indices = []
    similar_source_indices_list = []
    summary_sents_for_html = []
    article_lcs_paths_list = []
    token_probs_list = []
    ssi_length_extractive = None
    while len(summary_tokens) < 300:
        if len(summary_tokens) >= l_param and ssi_length_extractive is None:
            ssi_length_extractive = len(similar_source_indices_list)
        # if FLAGS.dataset_name == 'xsum' and len(summary_tokens) > 0:
        #     ssi_length_extractive = len(similar_source_indices_list)
        #     break
        if FLAGS.use_mmr:
            score_dict = util.calc_MMR_source_indices(article_sent_tokens,
                                                      summary_tokens,
                                                      None,
                                                      qid_ssi_to_importances,
                                                      qid=qid)
        else:
            score_dict = qid_ssi_to_importances[qid]
        sents, source_indices = get_best_source_sents(
            article_sent_tokens, score_dict, already_used_source_indices)
        if len(source_indices) == 0:
            break

        token_scores, token_mappings = get_token_info_for_ssi(
            qid_ssi_to_token_scores_and_mappings, qid, source_indices)
        # if np.max(token_mappings) !=
        token_cons_scores = consolidate_token_scores(token_scores,
                                                     token_mappings)
        if len(token_cons_scores) != len(sents):
            print(token_cons_scores, sents)
            raise Exception('Len of token_cons_scores %d != Len of sents %d' %
                            (len(token_cons_scores), len(sents)))
        # Pad the scores: if the instance was too long for BERT it got truncated,
        # so fill the tail of each sentence with 0 probabilities.
        padded_token_cons_scores = []
        for sent_idx, sent_scores in enumerate(token_cons_scores):
            sent = sents[sent_idx]
            if len(sent_scores) > len(sent):
                print(token_cons_scores, sents)
                raise Exception('Len of sent_scores %d > Len of sent %d' %
                                (len(sent_scores), len(sent)))
            while len(sent_scores) < len(sent):
                sent_scores.append(0.)
            padded_token_cons_scores.append(sent_scores)
        token_probs_list.append(padded_token_cons_scores)
        token_tags = threshold_token_scores(
            padded_token_cons_scores, FLAGS.tag_threshold
        )  # shape (1 or 2, len(sent)) 1 or 2 depending on if it is singleton/pair
        article_lcs_paths = ssi_functions.binary_tags_to_list(token_tags)
        article_lcs_paths_list.append(article_lcs_paths)

        # if FLAGS.tag_tokens and FLAGS.tag_loss_wt != 0:
        #     sents_only_tagged = filter_untagged(sents, token_tags)
        #     summary_sent_tokens.extend(sents_only_tagged)
        # else:
        summary_sent_tokens.extend(sents)

        summary_tokens = util.flatten_list_of_lists(summary_sent_tokens)
        similar_source_indices_list.append(source_indices)
        summary_sents_for_html.append(' <br> '.join(
            [' '.join(sent) for sent in sents]))
        if filter_sentences:
            already_used_source_indices.extend(source_indices)
    if ssi_length_extractive is None:
        ssi_length_extractive = len(similar_source_indices_list)
    selected_article_sent_indices = util.flatten_list_of_lists(
        similar_source_indices_list[:ssi_length_extractive])
    summary_sents = [
        ' '.join(sent) for sent in util.reorder(article_sent_tokens,
                                                selected_article_sent_indices)
    ]
    # summary = '\n'.join([' '.join(tokens) for tokens in summary_sent_tokens])
    return summary_sents, similar_source_indices_list, summary_sents_for_html, ssi_length_extractive, article_lcs_paths_list, token_probs_list
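threshold_token_scores and ssi_functions.binary_tags_to_list are used above without their definitions. Hedged sketches of the two steps, under the assumption that tags are simple 0/1 markers per token (the _sketch names are hypothetical):

def threshold_token_scores_sketch(token_scores, tag_threshold):
    # One binary tag per token: 1 if its consolidated score clears the threshold.
    return [[1 if score >= tag_threshold else 0 for score in sent_scores]
            for sent_scores in token_scores]

def binary_tags_to_list_sketch(token_tags):
    # Turn per-token binary tags into per-sentence lists of tagged token indices.
    return [[idx for idx, tag in enumerate(sent_tags) if tag == 1]
            for sent_tags in token_tags]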
Example no. 21
0
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 15 01:03:18 2017

@author: akankshya
"""

from util import reorder
import re
##print reorder('What were you doing yesterday')
##print reorder('When are you going to school')
l = reorder('i was going to school')
#print l
output = [l[0]]
for item in l:
    if re.search(r'(?:VB|MD)', item[0]):
        output.append(item)

print output


#output = filter(output, None)
#print output
def ret_out():
    return output
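For clarity, the VB/MD filter above can be illustrated on made-up (POS-tag, word) pairs; that this project's reorder() returns tagged pairs in this format is an assumption based on how item[0] is used:

import re

tagged = [('PRP', 'i'), ('VBD', 'was'), ('VBG', 'going'), ('TO', 'to'), ('NN', 'school')]
verbs = [item for item in tagged if re.search(r'(?:VB|MD)', item[0])]
print(verbs)  # [('VBD', 'was'), ('VBG', 'going')]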
Example no. 22
0
    def fill_example_queue(self):
        """Reads data from file and processes into Examples which are then placed into the example queue."""

        if self._example_generator is None:
            input_gen = self.text_generator(
                data.example_generator(self._data_path, self._single_pass, self._cnn_500_dm_500, is_original=False))
        else:
            input_gen = self.text_generator(self._example_generator)
        counter = 0
        while True:
            try:
                (article,
                 abstracts, doc_indices_str, raw_article_sents, ssi) = next(input_gen)  # read the next example from file. article and abstract are both strings.
            except StopIteration:  # if there are no more examples:
                logging.info("The example generator for this example queue filling thread has exhausted data.")
                if self._single_pass:
                    logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                    self._finished_reading = True
                    break
                else:
                    raise Exception("single_pass mode is off but the example generator is out of data; error.")

            article = article
            abstracts = [abstract for abstract in abstracts]
            if type(doc_indices_str) != str:
                doc_indices_str = doc_indices_str
            raw_article_sents = [sent for sent in raw_article_sents]

            all_abstract_sentences = [[sent.strip() for sent in data.abstract2sents(
                abstract)] for abstract in abstracts]
            if len(all_abstract_sentences) != 0:
                abstract_sentences = all_abstract_sentences[0]
            else:
                abstract_sentences = []
            doc_indices = [int(idx) for idx in doc_indices_str.strip().split()]
            if self._hps.by_instance:   # if we are running iteratively on only instances (a singleton/pair + a summary sentence), not the whole article
                for abs_idx, abstract_sentence in enumerate(abstract_sentences):
                    inst_ssi = ssi[abs_idx]
                    if len(inst_ssi) == 0:
                        continue
                    inst_abstract_sentences = abstract_sentence
                    inst_raw_article_sents = util.reorder(raw_article_sents, inst_ssi)
                    inst_article = ' '.join([' '.join(util.process_sent(sent, whitespace=True)) for sent in inst_raw_article_sents])
                    inst_doc_indices = [0] * len(inst_article.split())

                    if len(inst_article) == 0:  # See https://github.com/abisee/pointer-generator/issues/1
                        logging.warning(
                            'Found an example with empty article text. Skipping it.\n*********************************************')
                    elif len(inst_article.strip().split()) < 3 and self._hps.skip_with_less_than_3:
                        print(
                            'Article has less than 3 tokens, so skipping\n*********************************************')
                    elif len(inst_abstract_sentences.strip().split()) < 3 and self._hps.skip_with_less_than_3:
                        print(
                            'Abstract has less than 3 tokens, so skipping\n*********************************************')
                    else:
                        inst_example = Example(inst_article, [inst_abstract_sentences], all_abstract_sentences, inst_doc_indices, inst_raw_article_sents, None, self._vocab, self._hps)
                        self._example_queue.put(inst_example)
            else:
                example = Example(article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, ssi, self._vocab, self._hps)  # Process into an Example.
                self._example_queue.put(example)  # place the Example in the example queue.

            # print "example num", counter
            counter += 1
def main():
    #============================================================================
    #--0.Preparation for training model.
    maxnum_baselearner = 2
    filter_idx_set = set([11, 14, 64, 16])
    paths = json.loads(open("SETTINGS.json").read())

    print("Getting features for deleted papers from the disk files")
    features_conf = [feature for feature in \
            csv.reader(open(paths["trainpos_features"]))]
    features_deleted = [feature for feature in \
            csv.reader(open(paths["trainneg_features"]))]
    train_features = [list(map(float, x[2:])) for x in features_deleted + features_conf]
    train_features = feature_selection(train_features, filter_idx_set)
    train_target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]

    features_valid = [feature for feature in \
            csv.reader(open(paths["vali_features"]))]
    test_features = [list(map(float, x[2:])) for x in features_valid]
    test_features = feature_selection(test_features, filter_idx_set)
    author_confirmed = get_confirmed_paper(paths["vali_solution"])
    test_author_paper_ids = [x[:2] for x in features_valid]
    #============================================================================

    #============================================================================
    #--2.Model training (Random forest)
    base_learner_set = []
    classifier = RandomForestClassifier(n_estimators=360,
                                        verbose=2,
                                        n_jobs=4,
                                        min_samples_split=10,
                                        random_state=1)
    base_learner_set.append(classifier)

    regressor = GradientBoostingRegressor(loss='ls',
                                        learning_rate=0.1,
                                        n_estimators=450,
                                        max_depth=2)
    base_learner_set.append(regressor)

    max_map_val = 0.0
    bestnum_baselearner_map = 0
    min_mse_val = 1e5
    bestnum_baselearner_mse = 0
    dynamic_target = [i for i in train_target]
    for i in range(maxnum_baselearner):
        train_predictions = [0.0 for j in range(len(train_target))]
        test_predictions = [0.0 for j in range(len(test_features))]

        base_learner_set[i].fit(train_features, dynamic_target)

        for j in range(i+1):
            if j == 0:
                unit_predictions = base_learner_set[j].predict_proba(train_features)[:,1]
            elif j >= 1:
                unit_predictions = base_learner_set[j].predict(train_features)
            print(unit_predictions[0:5])
            print(unit_predictions[100:105])
            train_predictions = [pred1+pred2 for pred1, pred2 in\
                    zip(train_predictions, list(unit_predictions))]
            print(train_predictions[0:5])
            print(train_predictions[100:105])

            if j == 0:
                unit_predictions = base_learner_set[j].predict_proba(test_features)[:,1]
            elif j >= 1:
                unit_predictions = base_learner_set[j].predict(test_features)
            test_predictions = [pred1+pred2 for pred1, pred2 in\
                    zip(test_predictions, list(unit_predictions))]

        dynamic_target = [target-pred for target, pred in\
                zip(train_target, train_predictions)]
        print(dynamic_target[:10])

        author_predictions = defaultdict(list)
        paper_predictions = {}
        mse_predictions = []
        mse_labels = []
        for (a_id, p_id), pred in zip(test_author_paper_ids, test_predictions):
            if p_id in author_confirmed[int(a_id)]:
                author_predictions[int(a_id)].append([pred, int(p_id), 1])
                mse_labels.append(1)
            else:
                author_predictions[int(a_id)].append([pred, int(p_id), 0])
                mse_labels.append(0)
            mse_predictions.append(pred)

        for author_id in sorted(author_predictions):
            author_predictions[author_id] = reorder(author_predictions[author_id])
            paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

        print("Test the prediction results with MAP metric.")
        map_val = calcMAP(paper_predictions, author_confirmed)
        mse_val = mean_squared_error(mse_labels, mse_predictions)
        print "Iteration #%d: MAP value --> %f, MSE value -->%f\n"\
                % (i+1, map_val, mse_val)
        if map_val > max_map_val:
            max_map_val = map_val
            bestnum_baselearner_map = i+1
        print "Best MAP value --> %f, best number of learners --> %d\n"\
                % (max_map_val, bestnum_baselearner_map)
        if mse_val < min_mse_val:
            min_mse_val = mse_val
            bestnum_baselearner_mse = i+1
        print "Best MSE value --> %f, best number of learners --> %d\n"\
                % (min_mse_val, bestnum_baselearner_mse)

    pickle.dump(base_learner_set[0], open('./rf-model.pickle', "wb"))
    pickle.dump(base_learner_set[1], open('./gbrt-model.pickle', "wb"))
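A hypothetical follow-up (not part of the original script): reload the two pickled base learners and combine their scores the same additive way the training loop does.

import pickle

with open('./rf-model.pickle', 'rb') as f:
    rf_model = pickle.load(f)
with open('./gbrt-model.pickle', 'rb') as f:
    gbrt_model = pickle.load(f)

def combined_score(features):
    # The classifier contributes P(class = 1); the regressor its raw prediction.
    return rf_model.predict_proba(features)[:, 1] + gbrt_model.predict(features)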
Example no. 24
0
def autoprocess(cap, src):
    
    ########################################################################
    # usercount = os.getenv('COUNT')
    # IPCamFeed = 
    # DefaultCameFeed = 
    pathImage = "1.jpg"
    #cap = cv2.VideoCapture(0)
    # url = "http://192.168.1.18:8080/shot.jpg"
    # cap = cv2.VideoCapture(url)
    cap.set(10,160)
    heightImg = 640
    widthImg  = 480

    url = 'http://192.168.43.1:8080/shot.jpg'
    ########################################################################
    
    util.initializeTrackbars()
    count=0

    while True:
        
        if src == 2:
            #success, img = cap.read()
            img_resp = requests.get(url)
            img_arr = np.array(bytearray(img_resp.content), dtype=np.uint8)
            img = cv2.imdecode(img_arr, -1)
        elif src == 1:
            ret, img = cap.read()

        else:
            img = cv2.imread(pathImage)
        img = cv2.resize(img, (widthImg, heightImg)) # RESIZE IMAGE
        imgBlank = np.zeros((heightImg,widthImg, 3), np.uint8) # CREATE A BLANK IMAGE FOR TESTING/DEBUGGING IF REQUIRED
        imgGray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # CONVERT IMAGE TO GRAY SCALE
        imgBlur = cv2.GaussianBlur(imgGray, (5, 5), 1) # ADD GAUSSIAN BLUR
        thres=util.valTrackbars() # GET TRACK BAR VALUES FOR THRESHOLDS
        imgThreshold = cv2.Canny(imgBlur,thres[0],thres[1]) # APPLY CANNY EDGE DETECTION
        kernel = np.ones((5, 5))
        imgDial = cv2.dilate(imgThreshold, kernel, iterations=2) # APPLY DILATION
        imgThreshold = cv2.erode(imgDial, kernel, iterations=1)  # APPLY EROSION
    
        ## FIND ALL CONTOURS
        imgContours = img.copy() # COPY IMAGE FOR DISPLAY PURPOSES
        imgBigContour = img.copy() # COPY IMAGE FOR DISPLAY PURPOSES
        contours, hierarchy = cv2.findContours(imgThreshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # FIND ALL CONTOURS
        cv2.drawContours(imgContours, contours, -1, (0, 255, 0), 10) # DRAW ALL DETECTED CONTOURS
    
    
        # FIND THE BIGGEST CONTOUR
        biggest, maxArea = util.biggestContour(contours) # FIND THE BIGGEST CONTOUR
        if biggest.size != 0:
            biggest=util.reorder(biggest)
            cv2.drawContours(imgBigContour, biggest, -1, (0, 255, 0), 20) # DRAW THE BIGGEST CONTOUR
            imgBigContour = util.drawRectangle(imgBigContour,biggest,2)
            pts1 = np.float32(biggest) # PREPARE POINTS FOR WARP
            pts2 = np.float32([[0, 0],[widthImg, 0], [0, heightImg],[widthImg, heightImg]]) # PREPARE POINTS FOR WARP
            matrix = cv2.getPerspectiveTransform(pts1, pts2)
            imgWarpColored = cv2.warpPerspective(img, matrix, (widthImg, heightImg))
    
            # REMOVE 20 PIXELS FROM EACH SIDE
            imgWarpColored=imgWarpColored[20:imgWarpColored.shape[0] - 20, 20:imgWarpColored.shape[1] - 20]
            imgWarpColored = cv2.resize(imgWarpColored,(widthImg,heightImg))
    
            # APPLY ADAPTIVE THRESHOLD
            imgWarpGray = cv2.cvtColor(imgWarpColored,cv2.COLOR_BGR2GRAY)
            imgAdaptiveThre= cv2.adaptiveThreshold(imgWarpGray, 255, 1, 1, 7, 2)
            imgAdaptiveThre = cv2.bitwise_not(imgAdaptiveThre)
            imgAdaptiveThre=cv2.medianBlur(imgAdaptiveThre,3)

            canny_img = Canny_detector(img) 
            tess(imgWarpColored)
    
            # Image Array for Display
            # imageArray = ([img,imgGray,imgThreshold,imgContours],
            #             [imgBigContour,imgWarpColored, imgWarpGray,imgAdaptiveThre])
            imageArray = ([img,imgContours],
                        [imgBigContour,imgAdaptiveThre])
            if count <=0 :
                cv2.imwrite("doc.jpeg", imgAdaptiveThre)
                count+=1
            cases = 1
            lables = [["Original","Contours"],
                ["Biggest Contour","Adaptive Threshold"]]
            stackedImage = util.stackImages(imageArray,0.75,lables)
            cv2.imshow("Result",stackedImage)
            if cases == 1 and cv2.waitKey(25) & 0xFF == ord('s'):
                cv2.imwrite("auto/autodoc"+ str(time.time())+ ".jpg",imgAdaptiveThre)
                print("saved")
            
    
    
        else:
            # imageArray = ([img,imgGray,imgThreshold,imgContours],
            #             [imgBlank, imgBlank, imgBlank, imgBlank])
            imageArray = ([img,imgContours],
                        [img,img])
            lables = [["Original","Contours"],
                ["No Contour","No Adaptive Threshold"]]
            cases = 2
            stackedImage = util.stackImages(imageArray,0.75,lables)
            cv2.imshow("Result",stackedImage)
    
        # # LABELS FOR DISPLAY
        # lables = [["Original","Gray","Threshold","Contours"],
        #         ["Biggest Contour","Warp Prespective","Warp Gray","Adaptive Threshold"]]

        # LABELS FOR DISPLAY
        
    
        # stackedImage = util.stackImages(imageArray,0.75,lables)
        # cv2.imshow("Result",stackedImage)
    
        # SHOW THE 'Scan Saved' OVERLAY WHEN 'q' IS PRESSED
        if cv2.waitKey(1) & 0xFF == ord('q'):
            cv2.rectangle(stackedImage, ((int(stackedImage.shape[1] / 2) - 230), int(stackedImage.shape[0] / 2) + 50),
                        (1100, 350), (0, 255, 0), cv2.FILLED)
            cv2.putText(stackedImage, "Scan Saved", (int(stackedImage.shape[1] / 2) - 200, int(stackedImage.shape[0] / 2)),
                        cv2.FONT_HERSHEY_DUPLEX, 3, (0, 0, 255), 5, cv2.LINE_AA)
            cv2.imshow('Result', stackedImage)
            cv2.waitKey(300)
            print(cases)
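In this scanner code, util.reorder(biggest) presumably sorts the four detected corner points into a consistent [top-left, top-right, bottom-left, bottom-right] order so they line up with pts2 before the perspective warp. A hedged sketch of that idea (not the project's actual util module):

import numpy as np

def reorder_corners(points):
    # Arrange 4 contour points as [top-left, top-right, bottom-left, bottom-right].
    points = points.reshape((4, 2))
    ordered = np.zeros((4, 1, 2), dtype=np.int32)
    s = points.sum(axis=1)
    ordered[0] = points[np.argmin(s)]   # top-left: smallest x + y
    ordered[3] = points[np.argmax(s)]   # bottom-right: largest x + y
    d = np.diff(points, axis=1)
    ordered[1] = points[np.argmin(d)]   # top-right: smallest y - x
    ordered[2] = points[np.argmax(d)]   # bottom-left: largest y - x
    return ordered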