コード例 #1
0
ファイル: ocr_insert.py プロジェクト: RedHenLab/CDI
def ocr_find_story(ocr_results, ocr_time, stories):
    """Return the index of the story that best matches an OCR result, or -1.

    The minute/second pair is parsed out of *ocr_time* (positions 11-14 of a
    timestamp-like string), a +/-3 minute window is built around it, and every
    story overlapping that window is scored by word overlap with the cleaned
    OCR words.  The best-scoring story index is returned only when its overlap
    percentage reaches 0.4; otherwise -1.
    """
    reprog = re.compile(r"""\d\d\d\d\d\d\d\d\d\d(\d\d)(\d\d).+""", re.I | re.X)
    m = reprog.match(ocr_time)
    if m is None:
        # Malformed timestamp: previously this crashed on m.groups().
        return -1
    start_time_minute = int(m.group(1))
    start_time_second = int(m.group(2))
    # Clamp the +/-3 minute search window to a single hour [0, 59].
    time_range_left_min = max(start_time_minute - 3, 0)
    time_range_right_min = min(start_time_minute + 3, 59)
    ocr_words = cleansing.clean(ocr_results)
    # Very short tokens are too noisy to be useful for matching.
    ocr_words = [w for w in ocr_words if len(w) > 2]
    if not ocr_words:
        return -1
    overlap = 0
    story_id = -1  # defined up front so the final check can never see an unbound name
    for i, story in enumerate(stories):
        if time_overlap(time_range_left_min, time_range_right_min, start_time_second, story):
            p = words_overlap_percentage(ocr_words, story)
            if p > overlap:
                overlap = p
                story_id = i
    # Require a minimum 40% word overlap before accepting the match.
    return story_id if overlap >= 0.4 else -1
コード例 #2
0
ファイル: phrase-count.py プロジェクト: RedHenLab/CDI
def _read_caption_and_clean(captionfile):
    """Gather the words from content lines of a caption file and clean them.

    A line counts as content when it starts with '2' (a timestamp line) or its
    three-character tag is listed in ``g_content_tags``; the text after the
    last '|' separator is split into words.
    """
    collected = []
    with codecs.open(captionfile, 'r', encoding='ISO-8859-1') as fh:
        for raw in fh:
            if raw[0] == '2' or raw[0:3] in g_content_tags:
                collected.extend(raw.split('|')[-1].split())
    return cleansing.clean(collected)
コード例 #3
0
ファイル: readingfiles.py プロジェクト: RedHenLab/CDI
def read_testing_file(filenameprefix):
    """Read the triplets files of the segments that correspond to the test file.

    Globs ``data/transformed_triplet_files/<filenameprefix>*.txt``, skips
    teaser/NULL segments, parses each remaining file into Sentence objects,
    and keeps only segments longer than 5 sentences.

    Returns a two-element list: [all_sentences, true_segment], where
    true_segment is a list of sets of global sentence indices, one set per
    kept segment.
    """
    file_name = 'data/transformed_triplet_files/' + filenameprefix + '*.txt'
    files = glob.glob(file_name)
    files.sort()
    true_segment = []
    all_sentences = []
    line_count_total = 0  # global sentence offset across all kept segments
    for segments_file in files:
        # Delete the teaser files
        # The segment type is encoded in the filename; skip 'Teaser' and 'NULL'
        # segments entirely.
        if (segments_file.split('/')[-1].split('_')[-1].split('|')[0].split(':')[-1] == 'Teaser'
           or segments_file.split('/')[-1].split('_')[-1].split('.')[0] == 'NULL'):
            continue
        line_count = -1
        current_seg_sentences = []
        with open(segments_file, 'r') as f:
            for line in f:
                # Lines starting with '<' are markup/header lines, not triplets.
                if(line[0] != '<'):
                    line_count += 1
                    # Drop the trailing two characters (presumably '\r\n' line
                    # endings — TODO confirm) and lowercase.
                    line = (line[:-2]).lower()
                    triplets = line.split('|')

                    # A sentence whose first NP starts with a pronoun is flagged,
                    # checked before cleaning so the pronoun is still present.
                    np1 = triplets[0].split()
                    if np1 != [] and _is_pronoun(np1[0]):
                        pronoun_flag = True
                    else:
                        pronoun_flag = False
                    np1 = cleansing.clean(triplets[0].split())
                    vp = cleansing.clean(triplets[1].split())
                    np2 = cleansing.clean(triplets[2].split())
                    current_seg_sentences.append(Sentence(np1, vp, np2, pronoun_flag))

        # Only keep segments longer than 5 sentences
        segment_length = len(current_seg_sentences)
        if (segment_length > 5):
            # Record the global indices this segment occupies.
            seg = [(sid + line_count_total) for sid in range(0, segment_length)]
            true_segment.append(set(seg))
            all_sentences.extend(current_seg_sentences)
            line_count_total += segment_length
    return [all_sentences, true_segment]
コード例 #4
0
ファイル: readingfiles.py プロジェクト: RedHenLab/CDI
def read_triplet_file(triplet_filename, use_ocr=False):
    """Parse one triplet file into an OrignalDocument.

    Each non-markup line is split on '|' into up to three fields
    (NP1 | VP | NP2); the cleaned words of each field are accumulated
    separately.  When *use_ocr* is true, a matching OCR file under
    ``data/ocr_result_ori/`` is read (if it exists) and its cleaned words
    are included as well.  The document timestamp is parsed from the
    leading '%Y%m%d%H%M%S' portion of the filename.
    """
    ocr_file = None
    np1_words = []
    vp_words = []
    np2_words = []
    count = 0  # number of triplet (non-markup) lines seen
    with open(triplet_filename, 'r') as f:
        for line in f:
            # Lines starting with '<' are markup/header lines, not triplets.
            if(line[0] != '<'):
                count += 1
                # Drop the trailing two characters (presumably '\r\n' — TODO
                # confirm) and lowercase.
                line = (line[:-2]).lower()
                triplets = line.split('|')
                np1_words.extend(cleansing.clean(triplets[0].split()))
                # Some lines only carry NP1; VP/NP2 exist only on full triplets.
                if len(triplets) == 3:
                    vp_words.extend(cleansing.clean(triplets[1].split()))
                    np2_words.extend(cleansing.clean(triplets[2].split()))
    #if count < 10:
    #    return OrignalDocument('', '', [], [], [], [])
    ocr_words = []
    if use_ocr:
        #name_tmp = triplet_filename.split('&')[-2] + '_' + triplet_filename.split('&')[-1].replace('.txt', '.ocr')
        #ocr_file = 'data/ocr_result_080819-081015/' + name_tmp.lower()
        # OCR file shares the triplet file's basename (extension stripped).
        name_tmp = triplet_filename[:-4].split('/')[-1]
        ocr_file = 'data/ocr_result_ori/' + name_tmp.lower()
        # Missing OCR files are tolerated: ocr_words simply stays empty.
        if os.path.exists(ocr_file):
            with open(ocr_file, 'r') as f:
                for line in f:
                    if(line[0] != '<'):
                        line = (line[:-2]).lower()
                        ocr_words.extend(cleansing.clean(line.split()))

    #timestamp = datetime.datetime.strptime((triplet_filename.split('/')[-1]).split('&')[1].split('.')[0], '%Y%m%d%H%M%S')
    #name_tmp = triplet_filename.split('/')[-1][:-4].split('&')
    #filename = name_tmp[1] + '&' + name_tmp[2]
    timestamp = datetime.datetime.strptime((triplet_filename.split('/')[-1]).split('_')[0].split('.')[0], '%Y%m%d%H%M%S')
    filename = triplet_filename.split('/')[-1][:-4]
    return OrignalDocument(filename, timestamp, np1_words, vp_words, np2_words, ocr_words)
コード例 #5
0
ファイル: preprocessing.py プロジェクト: RedHenLab/CDI
def filter_story(filename):
    """Clean a story file and write the result under /dataset/08cleaned/.

    Only lines whose tag (the text before the first '|') appears in
    ``CONTENT_LINE_TAG`` contribute words; those words are passed through
    ``cleansing.clean`` and joined with spaces into the output file, which
    keeps the source basename plus a '.txt' suffix.
    """
    kept_words = []

    # read in story
    with codecs.open(filename, 'r', encoding='ISO-8859-1') as src:
        for raw in src:
            lowered = raw.lower()

            # only split and filter content lines
            # TAG|timestamp|conent
            pieces = lowered.split('|')
            if pieces[0] in CONTENT_LINE_TAG:
                kept_words.extend(cleansing.clean(pieces[-1].split(' ')))

    # write the cleaned story to disk
    out_path = '/dataset/08cleaned/' + filename.split('/')[-1]
    with codecs.open(out_path + '.txt', 'w', encoding='ISO-8859-1') as dst:
        dst.write(' '.join(kept_words))
コード例 #6
0
ファイル: story.py プロジェクト: hang-qi/CDI
 def add_line(self, line):
     """Clean the words of *line* and append them to this object's line_list."""
     self.line_list.extend(cleansing.clean(line.split()))