import re
from collections import Counter

from modules import csv_io

# read_subtitle_file, keyword_matching, to_frame_keyword, OUTPUT_ROOT_PATH and
# MAX_KEYWORDS_IN_ONE_INTERVAL are assumed to be defined elsewhere in this
# module.

def keyword_statistics(relationship_file, subtitle_file):

    relation_list = csv_io.read_csv(relationship_file)
    subtitle = read_subtitle_file(subtitle_file)

    relation_patterns = {}
    for relation in relation_list:
        # Two alternatives: the relation opens the line (optionally after
        # "-"/">" dialogue markers) followed by punctuation, or it appears
        # mid-line after whitespace when the three preceding characters are
        # not a possessive/article (her/his/our/(th)eir/" my"/". a"). Python
        # requires every lookbehind branch to have the same fixed width.
        relation_patterns[relation] = '(^[->]*' + relation.lower() + '[,.?!].*)' + '|' + \
                                      '(?<!(her|his|our|eir|\smy|.\sa))\s+' + relation.lower() + '[.,?!>]'
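        # e.g., with relation 'father' this matches "father, stop!" at the
        # start of a line and "where is father?" mid-line, but the lookbehind
        # rejects "her father." and "is a father.".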
        
    subtitle_interval = []
    time_to_keyword = []
    keyword_list = []
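    # Subtitles arrive in SRT-style blocks: an index line, a time-range line
    # ("00:01:02,000 --> 00:01:04,500"), one or more dialogue lines, then a
    # blank line. subtitle_interval tracks the position inside the current
    # block: line 1 is the index, line 2 the time range, and every later line
    # is dialogue to scan for keywords.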
    for line in subtitle:
        if line.strip():
            subtitle_interval.append(line)
            if len(subtitle_interval) < 2:
                continue

            if len(subtitle_interval) == 2:
                # Second line of the block is the time range; strip the
                # trailing newline (the original [:-2] assumed CRLF endings).
                subtitle_time = line.rstrip()
                continue
            
            time_to_keyword, keyword_list = keyword_matching(relation_patterns, line, subtitle_time,\
                                                             time_to_keyword, keyword_list)
        else:
            subtitle_interval = []

    frame_to_keyword = to_frame_keyword(time_to_keyword)

    csv_io.write_csv(OUTPUT_ROOT_PATH + 'statistics_result.csv', frame_to_keyword)
    csv_io.write_csv(OUTPUT_ROOT_PATH + 'keyword_list.csv', [keyword_list])
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

import sys
from my_class.Document import Document
from doc_preprocessing import get_docs_list
from modules import csv_io

def n_gram(content, n):
    # Slide a window of width n across the string to collect character
    # n-grams, UTF-8 encoded for CSV output.
    tokens = []
    for i in range(len(content) - n + 1):
        tokens.append(content[i:i+n].encode('utf-8'))
    return tokens
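
# Example: n_gram(u'hello', 2) -> ['he', 'el', 'll', 'lo']; on Chinese text
# such as u'你好嗎' the same call yields the character bigrams used below.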

if __name__=='__main__':
    if len(sys.argv) > 1: 
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/zh_doc/'

    document_list = get_docs_list(doc_input)
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        content = doc_obj.read().decode('utf-8')
        tokens = n_gram(content, 2)  # character bigrams suit unsegmented Chinese
        csv_io.write_csv('output/zh_tokens/' + doc, [tokens])
        del doc_obj
        doc_id += 1

def keyword_search(name_file, relationship_file, subtitle_file):
    
    # Read files 
    name_list = csv_io.read_csv(name_file)
    relation_list = csv_io.read_csv(relationship_file)
    subtitle = read_subtitle_file(subtitle_file)

    # Build the regular expression patterns once for reuse; both the patterns
    # and each subtitle line are lowercased before matching. The original
    # '[\s]*' prefix matched zero characters anywhere, letting keywords hit
    # inside longer words, so require a line start or preceding whitespace
    # instead. The trailing class rejects apostrophes and word characters.
    name_patterns = {}
    for name in name_list:
        name_patterns[name] = "(^|\s)" + name.lower() + "[^'\w]"

    relation_patterns = {}
    for relation in relation_list:
        relation_patterns[relation] = "(^|\s)" + relation.lower() + "[^'\w]"
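    # e.g., name 'Jane' yields "(^|\s)jane[^'\w]": it matches "hi jane," but
    # not "jane's" (apostrophe) or 'jane' buried inside a longer word.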

    # Find keywords; keyword_list[0] is a placeholder that is later replaced
    # by the most frequent keyword (the leading keyword).
    time_to_keyword = []
    subtitle_interval = []
    keyword_number = 0
    keyword_list = [""]
    keyword_count = {}
    for line in subtitle:
        if line.strip():
            subtitle_interval.append(line)
            if len(subtitle_interval) < 2:
                continue

            if len(subtitle_interval) == 2:
                subtitle_time = line.rstrip()  # time-range line; drop the newline
                continue

            # Record every name/relation hit in this block, up to the cap.
            for name in name_patterns:
                if keyword_number < MAX_KEYWORDS_IN_ONE_INTERVAL and re.search(name_patterns[name], line.lower()):
                    time_to_keyword.append([subtitle_time, name])
                    keyword_number += 1
                    if name not in keyword_list:
                        keyword_list.append(name)
                        keyword_count[name] = 1
                    else:
                        keyword_count[name] += 1

            for relation in relation_patterns:
                if keyword_number < MAX_KEYWORDS_IN_ONE_INTERVAL and re.search(relation_patterns[relation], line.lower()):
                    time_to_keyword.append([subtitle_time, relation])
                    keyword_number += 1
                    if relation not in keyword_list:
                        keyword_list.append(relation)
                        keyword_count[relation] = 1
                    else:
                        keyword_count[relation] += 1
        else:
            # A blank line ends the block. A block that hit the cap is treated
            # as noise: drop all of its hits from time_to_keyword (note that
            # keyword_count still retains them).
            if keyword_number == MAX_KEYWORDS_IN_ONE_INTERVAL:
                for i in range(MAX_KEYWORDS_IN_ONE_INTERVAL):
                    time_to_keyword.pop()
            subtitle_interval = []
            keyword_number = 0

    count = Counter(values[1] for values in time_to_keyword)
    total_count = sum(keyword_count.values())

    # Keep only keywords that account for at least 1.2% of all hits; the rest
    # are removed from both keyword_list and time_to_keyword.
    filter_list = []
    for name, freq in count.iteritems():
        if float(freq) / total_count >= 0.012:
            print name
        else:
            filter_list.append(name)

    for name in filter_list:
        keyword_list.remove(name)
        time_to_keyword = [values for values in time_to_keyword if values[1] != name]


    # Use the most frequent keyword as the leading keyword (keyword_count may
    # still include filtered names, so this is an approximation).
    keyword_list[0] = max(keyword_count, key=keyword_count.get)

    csv_io.write_csv(OUTPUT_ROOT_PATH + 'search_result.csv', time_to_keyword)
    csv_io.write_csv(OUTPUT_ROOT_PATH + 'keyword_list.csv', [keyword_list])
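
# Usage sketch (file names are illustrative, not the project's real paths):
#   keyword_search('input/names.csv', 'input/relations.csv', 'input/ep01.srt')
# writes search_result.csv and keyword_list.csv under OUTPUT_ROOT_PATH.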
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

import sys
from my_class.Document import Document
# Tokenizer's import path is inferred from the Document import above and may
# need adjusting to the project's actual layout.
from my_class.Tokenizer import Tokenizer
from doc_preprocessing import get_docs_list
from modules import json_io
from modules import csv_io

if __name__=='__main__':
    if len(sys.argv) > 1: 
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/en_doc/'
   
    document_list = get_docs_list(doc_input)
    tokenizer = Tokenizer()
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        # Tokenize, then normalize: drop stop words, keep digits verbatim,
        # and stem everything else.
        normalize_tokens = []
        for line in doc_obj.get_lines():
            tokens = tokenizer.to_tokens(line.decode('utf-8'))
            for token in tokens:
                if tokenizer.is_stop_word(token):
                    continue
                elif token.isdigit():
                    normalize_tokens.append(token.encode('utf-8'))
                else:
                    token = tokenizer.stemming(token)
                    normalize_tokens.append(token.encode('utf-8'))
        csv_io.write_csv('output/en_tokens/' + doc, [normalize_tokens])
        del doc_obj
        doc_id += 1
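
# The loop above assumes a Tokenizer exposing to_tokens(), is_stop_word() and
# stemming(). A minimal stand-in for trying this script without the real
# class (a sketch, not the project's my_class implementation):
class FakeTokenizer(object):
    def to_tokens(self, line):
        # naive whitespace tokenization
        return line.split()

    def is_stop_word(self, token):
        return token.lower() in (u'a', u'an', u'the')

    def stemming(self, token):
        return token.lower().rstrip(u's')  # crude demo stemmer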