def load_file(filename):
    """Load NER tokens for each story from a pickled story dict.

    Reads ``ip_dir/filename`` (a pickled dict keyed by story id) and, for
    each story, concatenates the NER token lists under
    ``['NER']['TITLE_CONTENT']`` for the PER/LOC/ORG/ONS categories.

    Returns:
        (corpus, doc_ids, event_list, total_no_word) where corpus is a
        list of token lists, doc_ids the story keys minus a trailing
        .html/.htm extension, event_list the distinct event ids (first
        two '_'-separated fields of a doc_id), and total_no_word the
        total token count over the kept documents.
    """
    corpus = []
    doc_ids = []
    event_list = []
    fname_total = '%s/%s' % (ip_dir, filename)
    # 'with' closes the handle exactly once; the original called
    # f.close() a second time after the loop on an already-closed file.
    with open(fname_total, 'rb') as f:
        story_dic = pickle.load(f)
    total_no_word = 0
    for story in sorted(story_dic):
        temp_doc = []
        # 'ONS' is presumably a misc/other NER category — TODO confirm.
        for item in ['PER', 'LOC', 'ORG', 'ONS']:
            temp_doc += story_dic[story]['NER']['TITLE_CONTENT'][item]
        # BUG FIX: str.strip('.html') removes any of the characters
        # '.', 'h', 't', 'm', 'l' from BOTH ends; only drop a genuine
        # trailing extension instead.
        doc_id = re.sub(r'\.html?$', '', story)
        event_id = '_'.join(doc_id.split('_')[:2])
        if len(temp_doc) > 0:
            corpus.append(temp_doc)
            doc_ids.append(doc_id)
            total_no_word += len(temp_doc)
            if event_id not in event_list:
                event_list.append(event_id)
    return corpus, doc_ids, event_list, total_no_word
def load_file(filename):
    """Read a comma-separated corpus file: one document per line.

    Each line is stripped and split on ',' to form one document
    (a list of strings); returns the list of documents.
    """
    with codecs.open(filename, encoding='utf8') as reader:
        return [row.strip().split(',') for row in reader]
def load_sentences(sentences):
    """Tokenize each sentence into word tokens (apostrophes kept inside
    words, e.g. "don't"); sentences that yield no tokens are dropped.
    """
    token_pattern = re.compile(r'\w+(?:\'\w+)?')
    tokenized = (token_pattern.findall(sentence) for sentence in sentences)
    return [tokens for tokens in tokenized if tokens]
def load_dataframe_jp(documents):
    """Tokenize Japanese documents with MeCab wakati (space-separated)
    segmentation; returns one token list per document.
    """
    wakati = MeCab.Tagger('-O wakati')
    # Prime the tagger before the first real parse — presumably works
    # around the well-known MeCab/Python binding quirk; verify.
    wakati.parse("")
    return [wakati.parse(text.strip()).split() for text in documents]
def load_dataframe(documents):
    """Tokenize each document string into word tokens (apostrophes kept
    inside words); documents that yield no tokens are dropped.
    """
    all_tokens = (re.findall(r'\w+(?:\'\w+)?', text) for text in documents)
    return [tokens for tokens in all_tokens if tokens]
def load_file(filename):
    """Load a corpus from a text file: one document per line.

    Each line is tokenized into word tokens (apostrophes kept inside
    words); blank/token-free lines are skipped.

    Returns:
        list of token lists, one per non-empty line.
    """
    corpus = []
    # 'with' guarantees the handle is closed even if a read raises;
    # the original leaked the handle on exception.
    with open(filename, 'r') as f:
        for line in f:
            doc = re.findall(r'\w+(?:\'\w+)?', line)
            if doc:
                corpus.append(doc)
    return corpus
def load_file(filename):
    """Load a corpus from a text file, one document per line.

    Lines are tokenized into word tokens (apostrophes kept inside
    words, e.g. "don't"); lines producing no tokens are skipped.

    Returns:
        list of token lists.
    """
    # Context manager closes the file even on error — the original
    # left the handle open if iteration raised.
    with open(filename, 'r') as handle:
        docs = (re.findall(r'\w+(?:\'\w+)?', line) for line in handle)
        return [doc for doc in docs if doc]
def load_file(filename):
    """Load a corpus from one file: one line corresponds to one doc.

    Each line is tokenized into word tokens (apostrophes kept inside
    words); lines with no tokens are dropped.

    Returns:
        list of token lists, one per non-empty line.
    """
    corpus = []
    # 'with' ensures the handle is always closed; the original only
    # closed it on the success path.
    with open(filename, 'r') as f:
        for line in f:
            doc = re.findall(r'\w+(?:\'\w+)?', line)
            if doc:
                corpus.append(doc)
    return corpus
def load_file_reuter(filename):
    """Load the Reuters-style pickled story dict from ``ip_dir/filename``.

    Each story contributes its raw 'content' to the corpus, its key to
    doc_ids, and its 'topic' to the distinct event list.

    Returns:
        (corpus, doc_ids, event_list, total_no_word).
    """
    corpus = []
    doc_ids = []
    event_list = []
    fname_total = '%s/%s' % (ip_dir, filename)
    # 'with' closes the handle exactly once; the original called
    # f.close() a second time after the loop on an already-closed file.
    with open(fname_total, 'rb') as f:
        story_dic = pickle.load(f)
    total_no_word = 0
    for story in sorted(story_dic):
        event_id = story_dic[story]['topic']
        corpus.append(story_dic[story]['content'])
        doc_ids.append(story)
        if event_id not in event_list:
            event_list.append(event_id)
    # NOTE(review): total_no_word is never accumulated and is always 0;
    # kept as-is to preserve the original return contract — confirm
    # whether callers expect a real count.
    return corpus, doc_ids, event_list, total_no_word
def load_file(filename):
    """Load stories from a pickled dict and tokenize title + content.

    Returns:
        (corpus, doc_ids, event_list) where each corpus entry is the
        token list of CONTENT + TITLE, doc_id is the story key minus a
        trailing .html/.htm extension, and event_id is the first two
        '_'-separated fields of the doc_id.
    """
    corpus = []
    doc_ids = []
    event_list = []
    # BUG FIX: pickle.load needs a binary-mode handle; text mode ('r')
    # raises TypeError on Python 3.  'with' also closes the handle
    # exactly once (the original called f.close() twice).
    with open(filename, 'rb') as f:
        story_dic = pickle.load(f)
    for story in sorted(story_dic):
        title_content = story_dic[story]['CONTENT'] + story_dic[story]['TITLE']
        doc = re.findall(r'\w+(?:\'\w+)?', title_content)  # tokenizing here
        # BUG FIX: str.strip('.html') removes any of the characters
        # '.', 'h', 't', 'm', 'l' from BOTH ends; only drop a genuine
        # trailing extension instead.
        doc_id = re.sub(r'\.html?$', '', story)
        event_id = '_'.join(doc_id.split('_')[:2])
        if len(doc) > 0:
            corpus.append(doc)
            doc_ids.append(doc_id)
            if event_id not in event_list:
                event_list.append(event_id)
    return corpus, doc_ids, event_list
def load_file(filename, format=False):
    """Load a corpus file: one document per line.

    Args:
        filename: path to the text file.
        format: when False, tokenize each line and skip token-free
            lines.  When True, read as utf-8 and keep one entry per
            input line — a line with no tokens repeats the previous
            document (the initial placeholder is the empty string).

    Returns:
        list of token lists (or the placeholder) per line.
    """
    corpus = []
    if format == False:
        # 'with' closes the handle even on error (original leaked it).
        with open(filename, 'r') as f:
            for line in f:
                doc = re.findall(r'\w+(?:\'\w+)?', line)
                if len(doc) > 0:
                    corpus.append(doc)
        return corpus
    else:
        with codecs.open(filename, 'r', 'utf-8') as f:
            pre_doc = ""
            for line in f:
                # BUG FIX: the original matched a str pattern against
                # line.encode("utf-8") (bytes), which raises TypeError
                # on Python 3; the codecs reader already yields str.
                doc = re.findall(r'\w+(?:\'\w+)?', line)
                if len(doc) > 0:
                    corpus.append(doc)
                    pre_doc = doc
                else:
                    # Token-free line: repeat the previous doc so the
                    # corpus stays aligned one-entry-per-line.
                    corpus.append(pre_doc)
        return corpus