import re

from nltk.stem import porter

# Module-level cache mapping raw words to their Porter stems, shared by the
# extraction helpers below.
stem_map = {}


def candidate_video_words(training_data_path):
    ppt_time = []
    words = []
    ori_words = []
    words_height = []
    with open(training_data_path, 'r') as f_obj:
        lines = f_obj.readlines()
    # The first line gives the slide count; each slide then occupies two
    # lines: a slide name, and a comma-separated list of word&height pairs.
    num_slide = int(lines[0])
    idx = 1
    stemmer = porter.PorterStemmer()
    for i in xrange(num_slide):
        slide_name = lines[idx].rstrip('\n')
        for item in lines[idx + 1].split(','):
            try:
                item = item.encode('ascii', 'ignore').lower()
            except UnicodeDecodeError:
                # Skip tokens that cannot be coerced to ASCII.
                continue
            item = item.split('&')
            if item[0] == '#':
                # '#' is the punctuation placeholder: record it without a
                # stem and give it an effectively infinite height.
                ori_words.append(item[0])
                words_height.append(1e8)
            else:
                if item[0] not in stem_map:
                    stem_map[item[0]] = stemmer.stem_word(item[0])
                words.append(stem_map[item[0]])
                ori_words.append(item[0])
                words_height.append(float(item[1]))
        idx += 2
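
# A minimal, illustrative check of candidate_video_words(). The two-lines-per-
# slide file layout (slide count first, then a name line and a comma-separated
# word&height line per slide) is inferred from the parser above; the file name
# and this helper are hypothetical, not part of the original pipeline.
def _demo_candidate_video_words():
    demo_path = 'demo_training.txt'
    with open(demo_path, 'w') as f:
        f.write('1\n')                            # one slide
        f.write('slide_001\n')                    # slide name
        f.write('deep&24.0,learning&24.0,#&0\n')  # word&height pairs
    candidate_video_words(demo_path)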

def read_keyphrases(path):
    """Read one keyphrase per line and return each phrase with every word
    Porter-stemmed, so phrases can be matched against stemmed video words."""
    keyphrases = []
    stemmer = porter.PorterStemmer()
    for phrase in open(path):
        stem_phrase = []
        words = phrase.replace('\n', '').lower().split(" ")
        for w in words:
            stem_w = stemmer.stem_word(w)
            stem_phrase.append(stem_w)
        keyphrases.append(" ".join(stem_phrase))
    return keyphrases
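
# Illustrative use of read_keyphrases() on a hypothetical file with one
# phrase per line. With a Porter stemmer, 'machine learning' comes back as
# 'machin learn', matching the stems stored in stem_map.
def _demo_read_keyphrases():
    with open('demo_keyphrases.txt', 'w') as f:
        f.write('neural networks\n')
        f.write('machine learning\n')
    # Returns ['neural network', 'machin learn'].
    return read_keyphrases('demo_keyphrases.txt')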

def words_filter(text, words, ori_words):
    """Tokenise `text`, replacing punctuation with the '#' placeholder;
    append stems to `words` and surface forms to `ori_words`. Returns the
    number of tokens seen."""
    count = 0
    reg = re.compile(r'([!"#%&()*+,-./:;<=>?@\[\\\]^_`{|}~])', re.IGNORECASE)
    text = re.sub(reg, '#', text)
    stemmer = porter.PorterStemmer()
    for word in text.split(" "):
        count += 1
        if word == '#':
            ori_words.append("#")
        elif word != " " and word != "":
            if word not in stem_map:
                stem_map[word] = stemmer.stem_word(word)
            words.append(stem_map[word])
            ori_words.append(word)
    return count
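
# Illustrative call to words_filter() on an in-line string: punctuation is
# rewritten to '#' before splitting, so a standalone comma becomes a '#'
# placeholder in ori_words while real words are stemmed into words.
def _demo_words_filter():
    words, ori_words = [], []
    n = words_filter("deep learning , explained", words, ori_words)
    # n == 4; words == ['deep', 'learn', 'explain'];
    # ori_words == ['deep', 'learning', '#', 'explained']
    return n, words, ori_words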

def coref_pos_filter(text, words, ori_words):
    """Tokenise `text` with the CoreNLP server, mapping SYM tokens to the
    '#' placeholder and stemming everything else. Returns the token count."""
    output = NLP_SERVER.annotate(text, properties={
        'annotators': 'coref',
        'outputFormat': 'json',
    })
    stemmer = porter.PorterStemmer()
    count = 0
    for sentence in output['sentences']:
        for word in sentence['tokens']:
            count += 1
            if word['pos'] in ['SYM']:
                ori_words.append('#')
            else:
                if word['word'] not in stem_map:
                    stem_map[word['word']] = stemmer.stem_word(word['word'])
                words.append(stem_map[word['word']])
                ori_words.append(word['word'])
    return count
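
# The NLP_SERVER global used by coref_pos_filter() is assumed to be a
# pycorenlp client for a running Stanford CoreNLP server; the localhost URL
# below is the conventional default and only an assumption:
#
#     from pycorenlp import StanfordCoreNLP
#     NLP_SERVER = StanfordCoreNLP('http://localhost:9000')
#
# Illustrative call (hypothetical helper): each token's POS tag decides
# whether it becomes a '#' placeholder (SYM) or a stemmed word.
def _demo_coref_pos_filter():
    words, ori_words = [], []
    n = coref_pos_filter('Deep learning is popular.', words, ori_words)
    return n, words, ori_words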