def keyword_statistics(relationship_file, subtitle_file):
    
    relation_list = csv_io.read_csv(relationship_file)
    subtitle = read_subtitle_file(subtitle_file)

    relation_patterns = {}
    for relation in relation_list:
        # match the relation at the start of a line, or mid-line when it is
        # not preceded by a possessive (her/his/our/their/my) or an article
        relation_patterns[relation] = ('(^[->]*' + relation.lower() + '[,.?!].*)' + '|' +
                                       '(?<!(her|his|our|eir|\smy|.\sa))\s+' + relation.lower() + '[.,?!>]')
        
    subtitle_interval = []
    time_to_keyword = []
    keyword_list = []
    for line in subtitle:
        if line.strip():
            subtitle_interval.append(line)
            # first line of a subtitle block is its index; skip it
            if len(subtitle_interval) < 2:
                continue

            # second line holds the time range; strip the trailing '\r\n'
            if len(subtitle_interval) == 2:
                subtitle_time = line[:-2]
                continue

            # remaining lines are dialogue; match them against the patterns
            time_to_keyword, keyword_list = keyword_matching(relation_patterns, line, subtitle_time,
                                                             time_to_keyword, keyword_list)
        else:
            # a blank line ends the block
            subtitle_interval = []

    frame_to_keyword = to_frame_keyword(time_to_keyword)

    csv_io.write_csv(OUTPUT_ROOT_PATH + 'statistics_result.csv', frame_to_keyword)
    csv_io.write_csv(OUTPUT_ROOT_PATH + 'keyword_list.csv', [keyword_list])
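
# A quick, illustrative check of the relation patterns built above; the
# sample lines are invented, and 'mother' stands in for any relation word.
import re
pattern = ('(^[->]*mother[,.?!].*)' + '|' +
           '(?<!(her|his|our|eir|\smy|.\sa))\s+mother[.,?!>]')
print re.search(pattern, 'mother, where are you?') is not None  # True
print re.search(pattern, 'i saw her mother.') is not None       # False
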
def social_reconstruction(keyword_list_file, relations_file):
    
    keyword_list = csv_io.read_csv(keyword_list_file)
    relations = json_io.read_json(relations_file)

    relation_graph = {'nodes': [], 'links': []}

    node_index = {}
    index = 0
    for keyword in keyword_list:
        if keyword not in node_index:
            relation_graph['nodes'].append({'name': keyword, 'group': index, 'ID': index})
            node_index[keyword] = index
            index += 1

    for name, relation in relations.iteritems():
        #total = sum(relation.values())
        for person in relation:
            #if total != 0 and (float(relation[person]) / total > (1.0/len(relation)) - 0.03 ):
            relation_graph['links'].append({'source': node_index[name], 'target': node_index[person],
                                           'value': relation[person], 'label': person })
            relation_graph['links'].append({'source': node_index[person], 'target': node_index[name],
                                           'value': relation[person], 'label': name })
    print relation_graph
    json_io.write_json('output/result/relation_graph.json', relation_graph)
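
# The output above uses the node-link shape that force-directed graph
# renderers (e.g. D3) expect; a hand-written example of what
# relation_graph.json might contain (names and values invented):
sample_graph = {
    'nodes': [{'name': 'rick', 'group': 0, 'ID': 0},
              {'name': 'sam', 'group': 1, 'ID': 1}],
    'links': [{'source': 0, 'target': 1, 'value': 2, 'label': 'sam'},
              {'source': 1, 'target': 0, 'value': 2, 'label': 'rick'}],
}
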
def movie_processing(movie_file, two_entity_file, search_result_file):
    two_entity_set = json_io.read_json(two_entity_file)
    keyword_search_result = csv_io.read_csv(search_result_file)

    # load video
    videoInput = cv2.VideoCapture(movie_file)

    # create a start_frame -> end_frame dictionary for two_entity_set look-ups
    start_end = {}
    for row in keyword_search_result:
        start_frame, end_frame = time_format.to_frame(row)
        # nudge duplicate keys apart so no interval is silently dropped
        while start_frame in start_end:
            start_frame = start_frame + 0.001
        while end_frame in start_end:
            end_frame = end_frame + 0.001
        start_end[start_frame] = end_frame

    frame = {}
    face_count = 0
    for keyword in two_entity_set:
        for start_frame in two_entity_set[keyword]:
            # expand the interval by ten seconds on each side (24 fps)
            frame_position = int(start_frame) - 24 * 10
            finish_frame = start_end[start_frame] + 24 * 10
            while frame_position <= finish_frame:
                print keyword
                videoInput.set(cv2.cv.CV_CAP_PROP_POS_FRAMES, frame_position)
                flag, img = videoInput.read()
                if not flag:  # seek past the end of the stream
                    break
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                gray = cv2.equalizeHist(gray)
                face_position_list, rects = cv_image.face_detect(gray, frame_position, (85, 85))
                #face_position_list, rects =  faceDetection(gray, frame_position)
                if 0xFF & cv2.waitKey(5) == 27:
                    cv2.destroyAllWindows()
                    sys.exit(1)
                
                if len(face_position_list) == 1:
                    print 'detected'
                    image_name = keyword + str(frame_position) + '.jpg'
                    cv_image.output_image(rects, img, OUTPUT_PATH + '/img/' + image_name)
                    for face_position in face_position_list:
                        face_count += 1
                        print face_count
                        frame[face_count] = {'keyword': keyword,
                                             'face_position': face_position.tolist(),
                                             'ID': face_count,
                                             'frame_position': frame_position,
                                             'face_id': face_count}
                frame_position += FRAME_INTERVAL
    # close video
    videoInput.release()

    json_io.write_json(OUTPUT_PATH + 'frame.json', frame) 
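
# The repeated `+= 0.001` above is a small trick to keep colliding frame
# numbers as distinct dictionary keys; a self-contained illustration:
start_end = {}
for start, end in [(100, 150), (100, 180)]:
    while start in start_end:
        start += 0.001
    start_end[start] = end
print start_end  # both intervals survive: 100 -> 150 and 100.001 -> 180
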
def find_relation(keyword_list_file, search_result_file, time_interval):
 
    time_to_keyword = csv_io.read_csv(search_result_file)
    keyword_list = csv_io.read_csv(keyword_list_file)
    leading_keyword = keyword_list[0]
    
    frame_to_keyword = {}
    for row in time_to_keyword:
        start_frame, end_frame = time_format.to_frame(row)
        while start_frame in frame_to_keyword:
            start_frame = start_frame + 0.001
        while end_frame in frame_to_keyword:
            end_frame = end_frame + 0.001
        frame_to_keyword[start_frame] = row[1]
        
    # Transform to timeline format
    frame_list = frame_to_keyword.keys()
    frame_list.sort()
    
    relations = {}
    for i in range(1, len(keyword_list)):
        relations[keyword_list[i]] = count_ralation(keyword_list[i], frame_list,
                                                    frame_to_keyword, time_interval)

    count = 0
    proper_relation = {}
    for name, relation in relations.iteritems():
        total = sum(relation.values())
        proper_relation[name] = {}
        print name, 
        for person in relation:
            if proper_test(total, leading_keyword, person, relation):
                proper_relation[name][person] = relation[person]
                print person , relation[person],
                count += 1
        print


    print str(time_interval/(24*60)) + ',' + str(count)
    json_io.write_json('output/relations.json', proper_relation)
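
# count_ralation is defined elsewhere; a rough sketch of what such a
# co-occurrence counter could look like, purely an assumption based on how
# it is called above (the real implementation may differ):
def count_ralation_sketch(keyword, frame_list, frame_to_keyword, time_interval):
    counts = {}
    for frame in frame_list:
        if frame_to_keyword[frame] != keyword:
            continue
        # count every other keyword that appears within time_interval frames
        for other in frame_list:
            neighbour = frame_to_keyword[other]
            if neighbour != keyword and abs(other - frame) <= time_interval:
                counts[neighbour] = counts.get(neighbour, 0) + 1
    return counts
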
def video_processing(movie_file, role_list_file, search_result_file,
                     role_input_way):

    # load frame-keyword files
    keyword_search_result = csv_io.read_csv(search_result_file)
    role_list = csv_io.read_csv(role_list_file)

    # load video
    videoInput = cv2.VideoCapture(movie_file)

    frame = {}
    keyword_id = 0
    frame_number = 0
    for row in keyword_search_result:

        start_frame, end_frame, keyword = float(row[0]), float(row[1]), row[2]
        frame_position = round(start_frame) + 24 * BACKWORD_EXPAND_TIME
        finish_frame = round(end_frame) + 24 * FORWORD_EXPAND_TIME

        keyword_id += 1
        keyword_time = keyword + '_t' + str(keyword_id)
        while frame_position <= finish_frame:

            face_position_list, rects, img = frame_caputre(
                videoInput, frame_position)

            if len(face_position_list) >= 1:
                print "detect face..."

                image_name = OUTPUT_PATH + 'img/' + keyword_time + str(
                    frame_number)
                #if role_input_way == 0:
                cv_image.output_image(rects, img, image_name)

                count = 0
                for face_position in face_position_list:

                    if role_input_way == 'auto':
                        # automatic identification via role_identify is
                        # disabled; skip labelling the remaining faces
                        break
                        #role_name = role_identify(
                        #    image_name + '-' + str(count) + '.jpg', role_list)
                    else:
                        role_name = role_input(role_list)
                    count += 1
                    if role_name == -1:
                        continue
                    else:
                        if keyword_time not in frame:
                            print keyword_time, role_name
                            frame[keyword_time] = {}
                            frame[keyword_time][role_name] = {
                                'keyword': keyword,
                                'face_position': face_position.tolist(),
                                'frame_position': frame_position,
                                'keyword_id': keyword_id,
                                'weight': 1,
                                'speaker': True
                            }
                        elif role_name in frame[keyword_time]:
                            frame[keyword_time][role_name]['weight'] += 1
                        else:
                            frame[keyword_time][role_name] = {
                                'keyword': keyword,
                                'face_position': face_position.tolist(),
                                'frame_position': frame_position,
                                'keyword_id': keyword_id,
                                'weight': 1,
                                'speaker': False
                            }
            frame_number += 1
            frame_position += FRAME_INTERVAL

    # close video
    videoInput.release()

    json_io.write_json(OUTPUT_PATH + 'keywordt_roles.json', frame)
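
# Illustrative shape of the keywordt_roles.json output written above
# (all values invented):
sample_frame = {
    'mother_t1': {
        'rick': {'keyword': 'mother', 'face_position': [10, 20, 60, 80],
                 'frame_position': 2400.0, 'keyword_id': 1,
                 'weight': 3, 'speaker': True},
    },
}
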
def keyword_search(name_file, relationship_file, subtitle_file):
    
    # Read files 
    name_list = csv_io.read_csv(name_file)
    relation_list = csv_io.read_csv(relationship_file)
    subtitle = read_subtitle_file(subtitle_file)

    # Create regular expression pattern for reuse 
    name_patterns = {}
    for name in name_list:
        name_patterns[name] = '[\s]*' + name.lower() + "[^'\w]" 

    relation_patterns = {}
    for relation in relation_list:
        relation_patterns[relation] = '[\s]*' + relation.lower() + "[^'\w]"

    # Find keyword
    time_to_keyword = []
    subtitle_interval = []
    keyword_number = 0
    keyword_list = [""]
    keyword_count = {}
    for line in subtitle:
        if line.strip():
            subtitle_interval.append(line)
            if len(subtitle_interval) < 2:
                continue

            if len(subtitle_interval) == 2:
                subtitle_time = line[:-2]
                continue

            for name in name_patterns:
                if keyword_number < MAX_KEYWORDS_IN_ONE_INTERVAL and re.search(name_patterns[name], line.lower()):
                    time_to_keyword.append([subtitle_time, name])
                    keyword_number += 1
                    if name not in keyword_list:
                        keyword_list.append(name)
                        keyword_count[name] = 1
                    else:
                        keyword_count[name] += 1
                         
            for relation in relation_patterns:
                if keyword_number < MAX_KEYWORDS_IN_ONE_INTERVAL and re.search(relation_patterns[relation], line.lower()):
                    time_to_keyword.append([subtitle_time, relation])
                    keyword_number += 1
                    if relation not in keyword_list:
                        keyword_list.append(relation)
                        keyword_count[relation] = 1
                    else:
                        keyword_count[relation] += 1
        else:
            # if this interval hit the keyword cap, drop everything it contributed
            if keyword_number == MAX_KEYWORDS_IN_ONE_INTERVAL:
                for i in range(MAX_KEYWORDS_IN_ONE_INTERVAL):
                    time_to_keyword.pop()
            subtitle_interval = []
            keyword_number = 0

    count = Counter(values[1] for values in time_to_keyword)
    total_count = sum(keyword_count.values())
    
    filter_list = []

    for name, freq in count.iteritems():
        # keep keywords that account for at least ~1.2% of all hits
        if float(freq) / total_count >= 0.012:
            print name
        else:
            filter_list.append(name)

    for name in filter_list:
        keyword_list.remove(name)
        time_to_keyword = [[values[0], values[1]] for values in time_to_keyword if values[1] != name]


    # Use the most frequent keyword as the leading keyword
    keyword_list[0] = max(keyword_count, key=keyword_count.get)

    csv_io.write_csv(OUTPUT_ROOT_PATH + 'search_result.csv', time_to_keyword)
    csv_io.write_csv(OUTPUT_ROOT_PATH + 'keyword_list.csv', [keyword_list])
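
# The 0.012 cut-off above keeps only keywords that account for roughly
# 1.2% or more of all hits; the same filter on toy data:
from collections import Counter
hits = [['t1', 'rick'], ['t2', 'rick'], ['t3', 'sam']]
count = Counter(values[1] for values in hits)
total = sum(count.values())
kept = [name for name, freq in count.iteritems() if float(freq) / total >= 0.012]
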
    # normalize each raw term count by the document length
    for term, count in term_tf.iteritems():
        term_tf[term] = count / float(length)

    json_io.write_json(output_path + doc + '.json', term_tf)

def to_db(mydb, term_id, document_list, doc_hash, input_dir):
    for doc in document_list:
        terms_tf = json_io.read_json(input_dir+doc)
        for term, tf in terms_tf.iteritems():
            term = term.replace("'", "")
            if len(term) > 255:
                term = term[:254]
            sql = "INSERT INTO doc_lookups (doc_id,title,tf,term_id) VALUES (" \
                    + "'" + str(doc_hash[doc[:-5]]) + "','" + doc[:-5]  + "','" + str(tf) + "','" + str(term_id[term]) + "');"
            mydb.exe_sql(sql)
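
# If mydb wraps a DB-API connection, a safer variant of the insert above
# would use parameter binding instead of string concatenation; a sketch
# assuming a cursor-style interface (not the actual mydb API):
def to_db_safe(cursor, term_id, document_list, doc_hash, input_dir):
    for doc in document_list:
        terms_tf = json_io.read_json(input_dir + doc)
        for term, tf_value in terms_tf.iteritems():
            term = term.replace("'", "")[:254]  # mirror the key handling above
            cursor.execute(
                "INSERT INTO doc_lookups (doc_id,title,tf,term_id) "
                "VALUES (%s, %s, %s, %s)",
                (doc_hash[doc[:-5]], doc[:-5], tf_value, term_id[term]))
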

if __name__ == '__main__':
    # doc_hash.json holds a mapping, so read it with json_io, not csv_io
    if len(sys.argv) > 1:
        doc_hash = json_io.read_json('output/doc_hash.json')
        input_dir = sys.argv[1]
        output_dir = 'output/zh_tf/'
    else:
        doc_hash = json_io.read_json('output/doc_hash.json')
        input_dir = 'output/en_tokens/'
        output_dir = 'output/en_tf/'

    document_list = get_docs_list(input_dir)
    for doc in document_list:
        terms = csv_io.read_csv(input_dir+doc)
        tf(terms, output_dir)

def __init__(self):
    self.relative_path = os.path.join("my_class/")
    self.stopword_list = csv_io.read_csv(self.relative_path + 'stopword.csv') + [u',']
    self.stemmer = PorterStemmer()

def reconstruct_role(recognition_merge_file, keyword_list_file):

    keyword_to_frame = json_io.read_json(recognition_merge_file)
    keyword_list = csv_io.read_csv(keyword_list_file)

    leading_keyword = keyword_list[0]

    for keyword, frame_list in keyword_to_frame.iteritems():
        for frame in frame_list:
            for face in frame_list[frame]:
                name = keyword + str(face['frame_position']) + '.jpg'
                face['img'] = cv2.imread(OUTPUT_PATH + '/img/' + name)

    detector, matcher = cv_face.init_feature('orb')
    # Find other characters
    face_list = {}
    character_list = {}
    for keyword, frame_list in keyword_to_frame.iteritems():
        print keyword
        for frame in frame_list:
            for face in frame_list[frame]:
                if face and face['face_id'] not in face_list:
                    face_list[face['face_id']] = []
                if face:
                    face_list[face['face_id']].append(face)
        # rank face ids by how many frames they appear in
        rank = sorted(face_list, key=lambda k: len(face_list[k]), reverse=True)
        character_list[keyword] = [face_list[rank[0]]]
        i = 0
        #for face in face_list[rank[5]]:
        #    i += 1
        for j in rank:
            face = face_list[j][0]
            cv2.imwrite(OUTPUT_PATH + '/result2/' + keyword + str(i) + '.jpg', face['img'])
            i += 1
        if len(rank) > 1 and '-' in keyword:
            character_list[keyword].append(face_list[rank[1]])
        '''    for i in range(1, len(rank)):
                if cv_face.list_match(MIN_MATCH, character_list[keyword][0], face_list[rank[i]], detector, matcher):
                    continue
                else:
                    character_list[keyword].append(face_list[rank[i]])
                    break 
            if len(character_list[keyword]) == 1:
                character_list[keyword].append(face_list[rank[1]])'''
        face_list = {}
        print


    role_list = {}
    # Use leading role image to check
    lead_role_list = character_list[leading_keyword]
    for keyword, characters in character_list.iteritems():
        if keyword == leading_keyword or len(characters) < 2:
            continue

        if leading_keyword in keyword:
            print keyword, '---'
            match_count1 = 0
            match_count2 = 0
            for face in character_list[leading_keyword][0]:
                match_count1 += cv_face.get_match_rate(face['img'], characters[0][0]['img'])
            cv2.imwrite(OUTPUT_PATH + '/result/' + '000' + keyword + '.jpg', characters[0][0]['img'])
            for face in character_list[leading_keyword][0]:
                match_count2 += cv_face.get_match_rate(face['img'], characters[1][0]['img'])
            cv2.imwrite(OUTPUT_PATH + '/result/' + '001' + keyword + '.jpg', characters[1][0]['img'])
            if match_count1 > match_count2:
                print 'characters1', match_count1, match_count2
                del characters[0]
            else:
                print 'characters2', match_count1, match_count2
                del characters[1]
            role_list[keyword.split('-')[0]] = characters[0]

    for keyword, characters in character_list.iteritems():
        if leading_keyword in keyword or len(characters) < 2:
            continue
        important_person = keyword.split('-')[1]
        if important_person in role_list: 
            print keyword, important_person, '---'
            match_count1 = 0
            match_count2 = 0
            for face in role_list[important_person]:
                match_count1 += cv_face.get_match_rate(face['img'], characters[0][0]['img'])
            #cv2.imwrite(OUTPUT_PATH + '/result/' + '000' + keyword + '.jpg', characters[0][0]['img'])
            for face in role_list[important_person]:
                match_count2 += cv_face.get_match_rate(face['img'], characters[1][0]['img'])
            #cv2.imwrite(OUTPUT_PATH + '/result/' + '001' + keyword + '.jpg', characters[1][0]['img'])
            if match_count1 > match_count2:
                print 'characters1', match_count1, match_count2
                del characters[0]
            else:
                print 'characters2', match_count1, match_count2
                del characters[1]
        else:
            del characters[1]

    # Output
    for keyword, characters in character_list.iteritems():
        for character in characters:
            if '-' in keyword:
                keyword = keyword.split('-')[0]
            cv2.imwrite(OUTPUT_PATH + '/result/' + keyword + '.jpg', character[0]['img'])
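
# The paired match_count comparisons above implement a simple vote:
# whichever candidate face track matches the reference faces better is
# kept. A condensed form of that decision, using the same
# cv_face.get_match_rate call:
def pick_character(reference_faces, candidates):
    scores = []
    for candidate in candidates:
        score = 0
        for face in reference_faces:
            score += cv_face.get_match_rate(face['img'], candidate[0]['img'])
        scores.append(score)
    return candidates[scores.index(max(scores))]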