def write_to_file(disaster_name, single_disaster_data, dest_file, process, stopword_handler):
    """Write every document of one disaster to ``dest_file`` as CSV rows.

    One row is written per document, with three columns:
    ``"<instance_name>_<file_id>"``, ``"<disaster_name> <instance_name>"``
    and the document text.

    Args:
        disaster_name: Label placed first in the second column of each row.
        single_disaster_data: Two-level mapping
            ``instance_name -> file_id -> text``.
        dest_file: Path of the CSV file to create (overwritten if present).
        process: When true, the text is passed through
            ``stopword_handler.remove_stopwords``, lower-cased, split into
            ``\\w+`` terms, and each term is replaced by ``stem(term)``.
        stopword_handler: Object providing ``remove_stopwords(text)``; only
            used when ``process`` is true.
    """
    import sys  # local import: only needed to choose the file mode below

    # csv.writer needs a binary file on Python 2, but a text file opened
    # with newline='' on Python 3 (a 'wb' file makes writerow raise
    # TypeError there).
    if sys.version_info[0] >= 3:
        out_file = open(dest_file, 'w', newline='')
    else:
        out_file = open(dest_file, 'wb')

    with out_file as f:
        spamwriter = csv.writer(f)
        for instance_name in single_disaster_data:
            instance_files = single_disaster_data[instance_name]
            labels = " ".join([disaster_name, instance_name])
            for file_id in instance_files:
                file_text = instance_files[file_id]
                if process:
                    file_text = stopword_handler.remove_stopwords(file_text)
                    # Each stem is prefixed with a space (so the result
                    # starts with one); kept as-is so existing output files
                    # stay byte-compatible.
                    file_text = "".join(
                        " %s" % stem(term)
                        for term in re.findall(r"\w+", file_text.lower())
                    )
                row_id = "%s_%s" % (instance_name, file_id)
                spamwriter.writerow([row_id, labels, file_text])
def __update_stemmed_model(self, text_string=None, text_list=None, text_dict=None, input_stemmed=False):
    """Add term counts from the supplied text to ``self._model``.

    Each non-empty input is processed independently, so more than one may
    be given in a single call. ``self._normalized`` is always reset to
    ``False`` afterwards.

    NOTE(review): assumes ``self._model`` behaves like
    ``collections.Counter`` (``update`` receives an iterable of terms and
    missing keys are incremented with ``+=``) -- confirm against the class.

    Args:
        text_string: Raw text; lower-cased and split into ``\\w+`` terms.
        text_list: Iterable of terms.
        text_dict: Mapping ``term -> count`` whose counts are added.
        input_stemmed: When false, every term is passed through ``stem``
            before being counted. When true, terms are counted as given
            (``text_string`` is still lower-cased and tokenised).
    """
    if text_string:
        terms = re.findall(r"\w+", text_string.lower())
        if not input_stemmed:
            terms = [stem(term) for term in terms]
        self._model.update(terms)

    if text_list:
        if not input_stemmed:
            # Lower-case before stemming so list input is normalised the
            # same way as the text_string and text_dict inputs.
            text_list = [stem(term.lower()) for term in text_list]
        self._model.update(text_list)

    if text_dict:
        for term, count in text_dict.items():
            key = term if input_stemmed else stem(term.lower())
            self._model[key] += count

    # New counts invalidate any previously computed normalisation.
    self._normalized = False
def do_stem(matchobj):
    """Return ``stem`` applied to the entire text matched by *matchobj*.

    The signature fits a replacement callback for ``re.sub`` -- presumably
    how it is used; verify against callers.
    """
    whole_match = matchobj.group(0)
    return stem(whole_match)