def extract_data(input_file_name, output_file_names):
    """Unzip the sentiment140 archive, rename its CSVs, and re-encode them as UTF-8.

    :param input_file_name: path of the zip archive to extract
    :param output_file_names: iterable of extracted CSV paths to re-encode in place
    """
    # -o (overwrite) also updates timestamps. Ruffus doesn't recognize these
    # files as complete results unless the timestamp is up to date.
    sh.unzip("-o", input_file_name)
    sh.mv("testdata.manual.2009.06.14.csv", "sentiment140.test.csv")
    sh.mv("training.1600000.processed.noemoticon.csv", "sentiment140.train.csv")
    # Re-encode the files as utf8. They look like utf8 already (e.g. file thinks
    # they're utf8) but they are actually encoded as latin1. This doesn't make a
    # difference for the test data (the utf8 and latin1 encoded test data are
    # identical files) but the train data has some byte sequences that are
    # invalid utf8 and this makes simplejson really upset.
    for output_file in output_file_names:
        # Use a unique temp name instead of a fixed "temp" file, which would
        # collide if two extractions run in the same directory concurrently.
        fd, tmp_path = tempfile.mkstemp()
        os.close(fd)  # mkstemp returns an open fd we would otherwise leak
        sh.mv(output_file, tmp_path)
        sh.iconv("-f", "latin1", "-t", "utf8", tmp_path, _out=output_file)
        # Remove the temp copy only after a successful re-encode (matches the
        # original's behavior of leaving it behind if iconv fails).
        os.remove(tmp_path)
def import_csv(self, username, userpass, filestream):
    """Import a CSV of student data into MySQL via the itaca2mysql script.

    The CSV is recoded from ISO-8859-15 to UTF-8, its header row is
    normalized, converted to XML, and fed to the external import command.

    :param username: unused in this method (presumably kept for API symmetry
        with sibling import methods — TODO confirm against callers)
    :param userpass: unused in this method (see above)
    :param filestream: stream whose contents are written to a temp CSV file
    :returns: True, unless the XML import command exits non-zero.
        NOTE(review): a failed _put_file or _csv2xml still returns True —
        this matches the original behavior; confirm whether intended.
    """
    retCode = True
    import_xml_command = sh.Command(
        "/usr/share/ds-matricula-plugin/matricula-common-scripts/1-itaca2mysql")
    # mkstemp() returns (fd, name); close each fd immediately so the
    # descriptor is not leaked (the original discarded it while still open).
    fd, TMP_CSV = tempfile.mkstemp()
    os.close(fd)
    fd, TMP_CSV2 = tempfile.mkstemp()
    os.close(fd)
    if self._put_file(TMP_CSV, filestream):
        fd, TMP_XML = tempfile.mkstemp()
        os.close(fd)
        # Recode to UTF-8, then rewrite line 1 (the header): drop blanks and
        # dots, map ñ/Ñ to plain n, and lowercase the whole line (\L&).
        # Raw string avoids the invalid "\." and "\L" Python escapes the
        # original relied on (value is byte-identical).
        sh.sed(sh.iconv("-f", "ISO-8859-15", "-t", "UTF-8", TMP_CSV),
               "-e", r"1{s%[[:blank:]]%%g;s%\.%%g;s%[ñÑ]%n%g;s%.*%\L&%}",
               _out=TMP_CSV2)
        if self._csv2xml(TMP_CSV2, TMP_XML):
            try:
                import_xml_command(TMP_XML)
            except ErrorReturnCode:
                # the import script exited non-zero
                retCode = False
        os.remove(TMP_XML)
    os.remove(TMP_CSV)
    os.remove(TMP_CSV2)
    return retCode
def process(self):
    """Dispatch on self.pro_type and run the corresponding processing pass
    over self.infile, writing results to self.outfile.

    Supported pro_type values: 'line', 'block', 'recall', 'analysis',
    'stem', 'dictgen'. Any other value is silently a no-op.
    """
    if not self.infile:
        # NOTE(review): logging.fatal only logs — execution continues and the
        # open() calls below will fail on a falsy infile; confirm intended.
        logging.fatal("no input specified")
    if self.pro_type == 'line':
        # Per-line processing: lines that produce output go to outfile,
        # lines that produce nothing are recorded in the noquerys file.
        with open(self.infile) as inf, open(self.outfile, 'ab') as of, open(mid_datadir + self.lang + '/noquerys_' + file_suffix, 'ab') as noqueryfile:
            for line in inf:
                text = self.line_process(line.strip(), self.infile)
                if text:
                    of.write(text + '\n')
                else:
                    noqueryfile.write(line.strip() + '\n')
    elif self.pro_type in ['block', 'recall']:
        # Whole-file processing delegated to the first registered operator.
        if len(self.operators):
            self.operators[0](self.infile, self.outfile)
    elif self.pro_type == 'analysis':
        # Per-line analysis; requires at least one operator to be registered.
        if len(self.operators):
            with open(self.infile) as inf, open(self.outfile, 'ab') as of:
                for line in inf:
                    text = self.analysis_process(line)
                    of.write(text.strip() + '\n')
    elif self.pro_type == 'stem':
        # Parallel stemming: shard the input round-robin into cpu_num temp
        # files, process each shard in a worker process, then concatenate.
        self.load_stem()
        self.open_temps = []  # open handles for the input shards
        self.in_files = []    # shard input paths
        self.out_files = []   # shard output paths
        self.args = []        # one argument tuple per worker
        for i in xrange(self.cpu_num):
            f = open(mid_datadir + self.lang + '/tempin' + str(i), 'wb')
            self.open_temps.append(f)
            self.in_files.append(
                mid_datadir + self.lang + '/tempin' + str(i))
            self.out_files.append(
                mid_datadir + self.lang + '/tempout' + str(i))
        for i in xrange(self.cpu_num):
            self.args.append(
                (self.in_files[i], self.out_files[i], self.lang,
                 self.stems, self.operators[0]))
        # Distribute input lines across the shards round-robin.
        with open(self.infile) as f:
            for i, line in enumerate(f):
                self.open_temps[i % self.cpu_num].write(line.strip() + '\n')
        for i in xrange(self.cpu_num):
            self.open_temps[i].close()
        pool = multiprocessing.Pool(self.cpu_num)
        pool.map(inner_process, self.args)
        # Merge shard outputs into the final outfile and clean up the shards.
        sh.cat(self.out_files, _out=self.outfile)
        sh.rm('-rf', self.in_files)
        sh.rm('-rf', self.out_files)
    elif self.pro_type == 'dictgen':
        # Dictionary generation: rewrite each tab-separated record using the
        # gentime_type lookup, recode to gb18030, then build the binary dict.
        tmp1 = mid_datadir + self.lang + '/temp1'
        tmp2 = mid_datadir + self.lang + '/temp2'
        tmp3 = mid_datadir + self.lang + '/temp3'
        sh.cat(self.infile, _out=tmp1)
        with open(tmp1) as inf, open(tmp2, 'wb') as outf:
            for line in inf:
                # Expected record layout: word \t type-key \t value — the
                # type-key is expanded via gentime_type (TODO confirm schema).
                text = line.strip().split('\t')
                outf.write(
                    '\t'.join([text[0], gentime_type[text[1]][0], text[2],
                               gentime_type[text[1]][1], '\n']))
        sh.iconv('-f', 'utf8', '-t', 'gb18030',
                 tmp2, _out=tmp3)
        # External dictionary builder; reads temp3 from mid_datadir and
        # writes the compiled dict under dict_dir.
        sh.createbin(
            '-n', 'temp3', '-N', 'fanshixiao_acdict',
            '-p', mid_datadir + self.lang + '/',
            '-P', dict_dir + self.lang + '/',
            '-f', '%z%d%d%d', '-k', '0', '-t', '1', '-m', '10000000')
        sh.rm('-rf', [tmp3, tmp1])
        sh.mv(tmp2, self.outfile)
def strip_invalid_utf8(str):
    """Run the input through ``iconv -c -t UTF-8`` and return the result.

    The -c flag makes iconv silently discard any byte sequences that cannot
    be converted, so the output contains only valid UTF-8.

    NOTE(review): the parameter shadows the builtin ``str``; renaming it
    would change the keyword-call signature, so it is left as-is.
    """
    cleaned = sh.iconv(str, "-c", "-t", "UTF-8")
    return cleaned