import re

from nltk.stem import porter

# Shared cache of word -> Porter stem so repeated words are stemmed once.
stem_map = {}

# coref_pos_filter below needs a CoreNLP client. Any client exposing
# annotate(text, properties=...) works; pycorenlp is one assumed option,
# and the server URL here is a hypothetical default:
from pycorenlp import StanfordCoreNLP
NLP_SERVER = StanfordCoreNLP('http://localhost:9000')


def candidate_video_words(training_data_path):
    """Read candidate slide words and their height values.

    The training-data file starts with the number of slides, followed by
    pairs of lines: a slide name, then a comma-separated list of
    `word&height` items, where a bare '#' marks a separator.
    """
    words = []
    ori_words = []
    words_height = []
    with open(training_data_path, 'r') as f_obj:
        lines = f_obj.readlines()
    num_slide = int(lines[0])
    idx = 1
    stemmer = porter.PorterStemmer()
    for i in xrange(num_slide):
        slide_name = lines[idx].strip()  # slide name line (not used below)
        for item in lines[idx + 1].split(','):
            try:
                item = item.encode('ascii', 'ignore').lower()
            except UnicodeDecodeError:
                # Skip items that cannot be represented in ASCII.
                continue
            item = item.split('&')
            if item[0] == '#':
                # Separator token: keep it in the original-word stream with
                # a sentinel height, but do not stem or index it.
                ori_words.append(item[0])
                words_height.append(1e8)
            else:
                if item[0] not in stem_map:
                    stem_map[item[0]] = stemmer.stem_word(item[0])
                words.append(stem_map[item[0]])
                ori_words.append(item[0])
                words_height.append(float(item[1]))
        idx += 2
    return words, ori_words, words_height
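
# A minimal sketch of the input format candidate_video_words expects,
# inferred from the parsing above (the file name and values here are
# hypothetical):
#
#     2
#     slide_01
#     machine&32.0,learning&32.0,#&0
#     slide_02
#     video&24.5,retrieval&24.5
#
#     words, ori_words, words_height = candidate_video_words('slides.txt')
#     # words        -> ['machin', 'learn', 'video', 'retriev']
#     # ori_words    -> ['machine', 'learning', '#', 'video', 'retrieval']
#     # words_height -> [32.0, 32.0, 1e8, 24.5, 24.5]
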
def read_keyphrases(path):
    """Read one keyphrase per line and return its Porter-stemmed form."""
    keyphrases = []
    stemmer = porter.PorterStemmer()
    for phrase in open(path):
        words = phrase.strip().lower().split(" ")
        # Stem every word of the phrase and rejoin, so keyphrases can be
        # matched against the stemmed word streams built above.
        stem_phrase = [stemmer.stem_word(w) for w in words]
        keyphrases.append(" ".join(stem_phrase))
    return keyphrases
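
# For example (hypothetical file), a keyphrase file containing
#
#     Machine Learning
#     Information Retrieval
#
# yields ['machin learn', 'inform retriev'] under the Porter stemmer,
# matching the stemmed forms stored in `words` above.
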
def words_filter(text, words, ori_words):
    """Replace punctuation with '#' separators and stem the other tokens.

    Appends stems to `words` and the original tokens (plus '#' separators)
    to `ori_words`; returns the total token count.
    """
    count = 0
    # Replace every punctuation character with '#' (the hyphen is escaped
    # so it is not read as a character range).
    reg = re.compile(r'[!"#%&()*+,\-./:;<=>?@\[\\\]^_`{|}~]')
    text = reg.sub('#', text)

    stemmer = porter.PorterStemmer()
    for word in text.split(" "):
        count += 1
        if word == '#':
            # Standalone punctuation becomes a separator in ori_words only.
            ori_words.append("#")
        elif word != "":
            if word not in stem_map:
                stem_map[word] = stemmer.stem_word(word)
            words.append(stem_map[word])
            ori_words.append(word)
    return count
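
# A quick behavior sketch (hypothetical input, assumed to be pre-tokenized
# with spaces around punctuation, as CoreNLP-style token streams are):
#
#     words, ori_words = [], []
#     n = words_filter("deep learning , explained !", words, ori_words)
#     # n         -> 5 (every token is counted, separators included)
#     # words     -> ['deep', 'learn', 'explain']
#     # ori_words -> ['deep', 'learning', '#', 'explained', '#']
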
def coref_pos_filter(text, words, ori_words):
    """POS-tag `text` with CoreNLP and stem every non-symbol token.

    Tokens tagged 'SYM' become '#' separators in `ori_words`; all other
    tokens are stemmed into `words`. Returns the total token count.
    """
    output = NLP_SERVER.annotate(text,
                                 properties={
                                     'annotators': 'coref',
                                     'outputFormat': 'json',
                                 })
    stemmer = porter.PorterStemmer()
    count = 0
    for sentence in output['sentences']:
        for word in sentence['tokens']:
            count += 1
            if word['pos'] in ['SYM']:
                ori_words.append('#')
            else:
                if word['word'] not in stem_map:
                    stem_map[word['word']] = stemmer.stem_word(word['word'])
                words.append(stem_map[word['word']])
                ori_words.append(word['word'])
    return count
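
# A minimal usage sketch, assuming a CoreNLP server is reachable at the
# URL configured for NLP_SERVER above and satisfies coref's prerequisite
# annotators (tokenization and POS tags are all this function reads):
#
#     words, ori_words = [], []
#     n = coref_pos_filter("The lecturer explains gradient descent.",
#                          words, ori_words)
#     # ori_words keeps every surface token ('SYM' tokens become '#'),
#     # while words holds the stemmed stream used for matching.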