def find_rest_t1(case_dir):
    """Search for the AC-PC Aligned T1 image that will be used for
    segmentation and partial volume correction"""
    os.chdir(case_dir)
    os.mkdir('T1_Segmentation')
    T1_dir = case_dir + '/' + 'T1_Segmentation'
    for fname in os.listdir(case_dir):
        if fname.startswith('MP-LAS-long') and fname.endswith('.nii'):
            T1File = fname
            shutil.move(T1File, T1_dir)
            break
        elif fname.startswith('MP-LAS-long') and fname.endswith('.zip'):
            T1File = fname
            shutil.move(T1File, T1_dir)
            os.chdir(T1_dir)
            with zf(T1File) as zf_name:
                zf_name.extractall()
            os.system('dcm2nii *')
            break
        elif fname.startswith('MP-LAS') and fname.endswith('.nii'):
            T1File = fname
            shutil.move(T1File, T1_dir)
            break
        elif fname.startswith('MP-LAS') and fname.endswith('.zip'):
            T1File = fname
            shutil.move(T1File, T1_dir)
            os.chdir(T1_dir)
            with zf(T1File) as zf_name:
                zf_name.extractall()
            os.system('dcm2nii *')
            break
    return T1_dir
def download_job(job_id):
    url = base_url + '/job/' + job_id
    try:
        if not os.path.isdir("job/"):
            os.mkdir("job")
        os.mkdir("job/" + job_id)
    except:
        updateStatus("There was a problem while creating job folder", 0)
        return False
    try:
        updateStatus("Downloading job", 10)
        file_name = "job/job_" + job_id + ".zip"
        u = urllib2.urlopen(url)
        f = open(file_name, 'wb')
        context = u.read()
        f.write(context)
        f.close()
    except:
        updateStatus("There was a problem while downloading the job.", 0)
        return False
    try:
        z = zf("job/job_" + job_id + ".zip")
        z.extractall("job/" + job_id)
        os.remove("job/job_" + job_id + ".zip")
    except:
        updateStatus("Downloaded file is broken", 20)
        return False
    return True
def zip_results():
    try:
        updateStatus('Compressing results', 80)
        z = zf("result.zip", mode="w")
        z.write("result.bam")
        z.write("result.bam.bai")
        z.close()  # finalize the archive before deleting the inputs
        os.remove("result.bam")
        os.remove("result.bam.bai")
        return True
    except:
        return False
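# A minimal usage sketch (not part of the original code) showing how download_job()
# and zip_results() above might be chained by a worker loop. run_alignment() and
# upload_result() are hypothetical placeholders for the steps that produce and ship
# result.bam / result.bam.bai; base_url and updateStatus are assumed to exist at
# module level, as in the functions above.
def process_job(job_id):
    if not download_job(job_id):
        return False
    run_alignment("job/" + job_id)       # hypothetical: writes result.bam + result.bam.bai
    if not zip_results():
        updateStatus("Could not compress results", 0)
        return False
    upload_result(job_id, "result.zip")  # hypothetical upload back to the server
    return True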
def job_upload():
    result = request.files["result_file"]
    if not result:
        return "No file uploaded"
    job_id = request.form["job_id"]
    if not job_id:
        return "No job_id specified"
    path = os.path.join(conf.get("folders", "result"), str(job_id))
    if not os.path.exists(path):
        os.makedirs(path)
    filename = secure_filename(result.filename)
    fullpath = os.path.join(path, filename)
    result.save(fullpath)
    try:
        p = zf(fullpath)
        p.extractall(path)
        print "Unzipped the file"
        bam_file = os.path.join(path, "result.bam")
        cur = conn.cursor()
        cur.execute("update job set result_file = %s where job_id = %s", (bam_file, job_id))
        conn.commit()
        call([conf.get("executables", "demux"), job_id,
              conf.get("files", "muxkey"), conf.get("folders", "groups")])
        print "finished demux, checking completed groups"
        cur.execute(
            "select group_id from fastq_file_group where (select count(*) from job "
            "where job.group_id = fastq_file_group.group_id and completed = false) = 0 "
            "and merged = false;")
        group = cur.fetchone()
        if group != None:
            group_id = group[0]
            print "found group ", group_id
            cur.execute("select file_id from fastq_file where group_id = %s;", (group_id,))
            while True:
                file = cur.fetchone()
                if file == None:
                    break
                file_id = file[0]
                print "merging FASTQ" + str(file_id)
                group_path = os.path.join(conf.get("folders", "groups"), "FASTQ" + str(file_id))
                output = os.path.join(group_path, "result.bam")
                files = os.path.join(group_path, "*")
                print group_path
                print output
                print files
                call("samtools merge " + output + " " + files, shell=True)
                call("samtools index " + output, shell=True)
            cur.execute("update fastq_file_group set merged = true where group_id = %s", (group_id,))
            conn.commit()
            cur.close()
            print "merged group", group_id
        print "no groups to merge"
    except:
        return "Error"
    return "Done"
def unzip_rest_get_dir(rest_rawz_dir, case_dir):
    # Function unzips the raw rsfMRI dicom dir and returns the new unzipped dir.
    with zf(rest_rawz_dir) as zf_dir:
        zf_dir.extractall('rsfMRI-raw')
    for fname in os.listdir(case_dir + '/' + 'rsfMRI-raw'):
        rest_sub_dir = fname
    # Create a variable for the dicom directory
    dicomdir = case_dir + '/' + 'rsfMRI-raw' + '/' + rest_sub_dir
    return dicomdir
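# Hedged usage sketch (not part of the original source); the paths are placeholders.
# unzip_rest_get_dir() extracts into a cwd-relative 'rsfMRI-raw' folder, so the
# caller is assumed to chdir into the case directory first.
case_dir = '/data/study/case001'
os.chdir(case_dir)
dicomdir = unzip_rest_get_dir(case_dir + '/rsfMRI-raw.zip', case_dir)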
def read_files(self, cr, uid, filename):
    staff_file = 'staff.csv'
    contract_file = 'contrat.csv'
    job_file = 'fonction.csv'
    job_reader = False
    contract_reader = False
    staff_reader = False
    desc_to_close = []
    tmpdir = False
    if is_zipfile(filename):
        zipobj = zf(filename)
        if zipobj.namelist() and job_file in zipobj.namelist():
            job_reader = csv.DictReader(zipobj.open(job_file), quotechar='"', delimiter=',',
                                        doublequote=False, escapechar='\\')
            # Do not raise error for job file because it's just a useful piece of data, but not more.
        # read the contract file
        if zipobj.namelist() and contract_file in zipobj.namelist():
            contract_reader = csv.DictReader(zipobj.open(contract_file), quotechar='"', delimiter=',',
                                             doublequote=False, escapechar='\\')
        # read the staff file
        if zipobj.namelist() and staff_file in zipobj.namelist():
            # Doublequote and escapechar avoid some problems
            staff_reader = csv.DictReader(zipobj.open(staff_file), quotechar='"', delimiter=',',
                                          doublequote=False, escapechar='\\')
    else:
        tmpdir = self._extract_7z(cr, uid, filename)
        job_file_name = os.path.join(tmpdir, job_file)
        if os.path.isfile(job_file_name):
            job_file_desc = open(job_file_name, 'rb')
            desc_to_close.append(job_file_desc)
            job_reader = csv.DictReader(job_file_desc, quotechar='"', delimiter=',',
                                        doublequote=False, escapechar='\\')
        contract_file_name = os.path.join(tmpdir, contract_file)
        if os.path.isfile(contract_file_name):
            contract_file_desc = open(contract_file_name, 'rb')
            desc_to_close.append(contract_file_desc)
            contract_reader = csv.DictReader(contract_file_desc, quotechar='"', delimiter=',',
                                             doublequote=False, escapechar='\\')
        staff_file_name = os.path.join(tmpdir, staff_file)
        if os.path.isfile(staff_file_name):
            staff_file_desc = open(staff_file_name, 'rb')
            desc_to_close.append(staff_file_desc)
            staff_reader = csv.DictReader(staff_file_desc, quotechar='"', delimiter=',',
                                          doublequote=False, escapechar='\\')
    if not contract_reader:
        raise osv.except_osv(_('Error'), _('%s not found in given zip file!') % (contract_file,))
    if not staff_reader:
        raise osv.except_osv(_('Error'), _('%s not found in given zip file!') % (staff_file,))
    return (job_reader, contract_reader, staff_reader, desc_to_close, tmpdir)
def bls_cew_consolidate(fdDir):
    """Consolidate downloaded BLS CEW data."""
    d = check_directory_consolidate(fdDir.joinpath('bls/cew'))
    zips = d.glob('*.zip')
    csvfile = d / 'data.csv'
    dtypes = get_bls_dtypes(bls_cew)
    header = True  # write header only once
    with csvfile.open('a') as f:
        for z in zips:
            qprint('Consolidating {0}...'.format(str(z).split('/')[-1]), end="\r")
            with zf(str(z), 'r') as zfile:
                csvs = (csv for csv in zfile.namelist()
                        if re.search(r'all industries.csv', csv))
                for csv in csvs:
                    for chunk in pd.read_csv(zfile.open(csv), chunksize=10000):
                        if False:
                            # TODO consolidate only fips rows of CSVs
                            # TODO need have fips.csv on hand
                            fips = pd.read_csv(fdDir + "fips.csv")
                            chunk = chunk[chunk.area_fips.isin(fips.fips)]
                        # fix incorrectly named column
                        try:
                            chunk.rename(
                                columns={'oty_taxable_qtrly_wages_chg.1':
                                         'oty_taxable_qtrly_wages_pct', },
                                inplace=True)
                        except KeyError:
                            pass
                        # make data types match across chunks
                        chunk = convert_dtypes(chunk, dtypes)
                        chunk.to_csv(f, header=header, index=False, float_format='%.2f')
                        header = False
    qprint("bls:cew data consolidated\x1b[K.")
def main():
    import codecs
    parser = argparse.ArgumentParser(description="Print monolingual data")
    parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('r'),
                        default=[sys.stdin, ],
                        help="input zip file(s) (each contains a multi file)")
    # parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file (single text file)")
    parser.add_argument("--outfile", "-o", help="output file (single text file)")
    parser.add_argument("--xml", "-x", action='store_true', help="process ltf xml files")
    parser.add_argument("--tokenize", action='store_true', help="use tokens (only applies if -x)")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    of = codecs.open(args.outfile, 'w', 'utf-8')
    for infile in args.infile:
        archive = zf(infile)
        for info in archive.infolist():
            if info.file_size < 20:
                continue
            # plain processing assumes rsd structure
            if not args.xml and os.path.dirname(info.filename) != 'rsd':
                continue
            # print info.filename
            with archive.open(info, 'rU') as ifh:
                if args.xml:
                    xobj = ET.parse(ifh)
                    if args.tokenize:
                        of.writelines([' '.join([y.text for y in x.findall(".//TOKEN")]) + "\n"
                                       for x in xobj.findall(".//SEG")])
                    else:
                        of.writelines([x.text + "\n"
                                       for x in xobj.findall(".//ORIGINAL_TEXT")])
                else:
                    lines = ifh.readlines()
                    for line in lines:
                        of.write(line.decode('utf8'))
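# Hedged usage note for the extractor above (script and file names are placeholders,
# not taken from the original source). With --xml it pulls ORIGINAL_TEXT (or TOKEN
# text, with --tokenize) out of each ltf.xml member; otherwise it dumps the rsd/
# members verbatim into one UTF-8 text file:
#
#   python print_mono.py -i mono.ltf.zip -o mono.flat --xml --tokenize
#   python print_mono.py -i mono.rsd.zip -o mono.flat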
def main():
    import codecs
    parser = argparse.ArgumentParser(
        description="Extract and print psm annotation data from LRLP in a form "
                    "that is amenable to insertion into future xml",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--infile", "-i", nargs="+", type=argparse.FileType("rb"), default=[sys.stdin],
        help="input zip file(s) (each contains a multi file)",
    )
    parser.add_argument(
        "--outfile", "-o", type=argparse.FileType("w"), default=sys.stdout,
        help="where to write extracted semantic info",
    )
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    outfile = args.outfile
    nonehash = {"value": "None"}
    for infile in args.infile:
        inbase = ".".join(os.path.basename(infile.name).split(".")[:-2])
        archive = zf(infile)
        for info in archive.infolist():
            if info.file_size < 20:
                continue
            # Assume psm structure
            if os.path.dirname(info.filename) != "psm":
                continue
            with archive.open(info, "rU") as ifh:
                xobj = ET.parse(ifh)
                try:
                    headlines = [
                        (x.get("begin_offset"), x.get("char_length"))
                        for x in xobj.findall("string[@type='headline']")
                    ]
                    # TODO: funornone this back into functional
                    postnodes = xobj.findall("string[@type='post']")
                    posts = []
                    for x in postnodes:
                        post = []
                        anode = x.find("attribute[@name='author']")
                        if anode is None:
                            anode = nonehash
                        dnode = x.find("attribute[@name='datetime']")
                        if dnode is None:
                            dnode = nonehash
                        posts.append(
                            (x.get("begin_offset"), x.get("char_length"),
                             anode.get("value"), dnode.get("value"))
                        )
                except:
                    print(info.filename)
                    raise
                    sys.exit(1)
                # GENRE/LANG/DATE info will be gleaned from filename later.
                # assume psm.xml and strip it off
                fname = os.path.basename(info.filename).split(".psm.xml")[0]
                for h in headlines:
                    outfile.write("\t".join(("headline", fname) + h) + "\n")
                for p in posts:
                    outfile.write("\t".join(("post", fname) + p) + "\n")
def main():
    parser = argparse.ArgumentParser(description="Extract and print monolingual data, "
                                     "tokenized, morph, pos tag and original, with manifests")
    parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('rb'),
                        default=[sys.stdin, ],
                        help="input zip file(s) (each contains a multi file)")
    parser.add_argument("--outdir", "-o", help="where to write extracted files")
    parser.add_argument("--nogarbage", action='store_true', default=False,
                        help="turn off garbage filtering")
    parser.add_argument("--toksubdir", default="tokenized",
                        help="subdirectory for tokenized files")
    parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                        help="subdirectory for cdec-tokenized files")
    parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                        help="subdirectory for tokenized files based on "
                             "morphological segmentation")
    parser.add_argument("--morphsubdir", default="morph",
                        help="subdirectory for morphological information")
    parser.add_argument("--origsubdir", default="original",
                        help="subdirectory for untokenized files")
    parser.add_argument("--garbagesubdir", default="garbage",
                        help="subdirectory for garbage files (under orig)")
    parser.add_argument("--possubdir", default="pos",
                        help="subdirectory for pos tag files")
    parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir, "cdectok.sh"),
                        help="cdec tokenizer program wrapper")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    tokoutdir = os.path.join(args.outdir, args.toksubdir)
    origoutdir = os.path.join(args.outdir, args.origsubdir)
    cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
    morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
    morphoutdir = os.path.join(args.outdir, args.morphsubdir)
    posoutdir = os.path.join(args.outdir, args.possubdir)
    dirs = [args.outdir, tokoutdir, cdectokoutdir, origoutdir,
            morphtokoutdir, morphoutdir, posoutdir]
    if args.nogarbage:
        garbageoutdir = None
    else:
        garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
        dirs.append(garbageoutdir)
    for dir in dirs:
        if not os.path.exists(dir):
            os.makedirs(dir)

    defaultcount = 0
    for infile in args.infile:
        inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
        if len(inbase) == 0:
            inbase = "default.%d" % defaultcount
            defaultcount += 1
        archive = zf(infile)
        man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
        orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
        if args.nogarbage:
            garbage_fh = None
            garbage_man_fh = None
        else:
            garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
            garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
        tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
        morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
        morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
        pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
        for info in archive.infolist():
            if info.file_size < 20:
                continue
            # assume ltf filename
            if not info.filename.endswith("ltf.xml"):
                continue
            # print info.filename
            with archive.open(info, 'rU') as ifh:
                try:
                    xobj = ET.parse(ifh)
                    docid = xobj.findall(".//DOC")[0].get('id')
                    origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
                    garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
                    goodmask = [not x for x in garbagemask]
                    seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                               for x in xobj.findall(".//SEG")]
                    for line in compress(origlines, garbagemask):
                        orig_fh.write(line)
                    for tup in compress(seginfo, garbagemask):
                        man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
                    if not args.nogarbage:
                        for line in compress(origlines, goodmask):
                            garbage_fh.write(line)
                        for tup in compress(seginfo, goodmask):
                            garbage_man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
                    for x in compress(xobj.findall(".//SEG"), garbagemask):
                        tokens = x.findall(".//TOKEN")
                        toktext = []
                        morphtoktext = []
                        morphtext = []
                        postext = []
                        for y in tokens:
                            if y.text is None:
                                continue
                            toktext.append(y.text)
                            postext.append(y.get("pos") or "none")
                            for mt, mtt in morph_tok(y):
                                morphtext.append(mt)
                                morphtoktext.append(mtt)
                        tok_fh.write(' '.join(toktext) + "\n")
                        morphtok_fh.write(' '.join(morphtoktext) + "\n")
                        morph_fh.write(' '.join(morphtext) + "\n")
                        pos_fh.write(' '.join(postext) + "\n")
                except ET.ParseError:
                    sys.stderr.write("Parse error on " + ifh.name + "\n")
                    continue
        orig_fh.close()
        cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                             orig_fh.name,
                                             os.path.join(cdectokoutdir, "%s.flat.lc" % inbase),
                                             os.path.join(cdectokoutdir, "%s.flat" % inbase))
        p = subprocess.Popen(shlex.split(cdec_cmd))
        p.wait()
def distance(distance, *args):
    if isinstance(distance, str):
        distance_list = [distance]
    elif isinstance(distance, list):
        distance_list = distance
    else:
        raise Exception(
            "Unknown distance type. Provide a name (str) or a list of str.")
    for dist in distance_list:
        if dist not in DISTANCES:
            raise Exception("Unknown distance " + dist +
                            ". The available ones are: " + ' '.join(DISTANCES))
    if len(args) == 1 and not isinstance(args[0], list):
        raise Exception(
            "Error: You only provided one language argument.\n"
            "Provide multiple language arguments, or a single list of languages as arguments."
        )
    if len(args) == 1 and isinstance(args[0], list):
        langs = args[0]
    else:
        langs = [l for l in args]
    for l in langs:
        if l not in DISTANCE_LANGUAGES:
            raise Exception(
                "Unknown language " + l +
                " (or maybe we don't have precomputed distances for this one).")
    indeces = [DISTANCE_LANGUAGES.index(l) for l in langs]
    N = len(indeces)
    if N == 2:
        out = []
        with zf(DISTANCES_FILE, 'r') as zp:
            for dist in distance_list:
                data = sparse.load_npz(zp.open(map_distance_to_filename(dist)))
                if indeces[0] > indeces[1]:
                    out.append(data[indeces[1], indeces[0]])
                else:
                    out.append(data[indeces[0], indeces[1]])
        if len(out) > 1:
            return out
        else:
            return out[0]
    else:
        arr_list = [np.zeros((N, N)) for dist in distance_list]
        with zf(DISTANCES_FILE, 'r') as zp:
            for k, dist in enumerate(distance_list):
                data = sparse.load_npz(zp.open(map_distance_to_filename(dist)))
                for a, i in enumerate(indeces):
                    for b, j in enumerate(indeces):
                        if a != b:
                            if i > j:
                                arr_list[k][a, b] = data[j, i]
                            else:
                                arr_list[k][a, b] = data[i, j]
        if len(arr_list) > 1:
            return arr_list
        else:
            return arr_list[0]
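# Hedged usage sketch for distance() above. The distance names and language codes
# below are illustrative placeholders; valid values are whatever the DISTANCES and
# DISTANCE_LANGUAGES constants defined alongside the function actually contain.
pair = distance('genetic', 'eng', 'fra')                               # scalar for a 2-language query
matrices = distance(['genetic', 'geographic'], ['eng', 'fra', 'deu'])  # one N x N matrix per distance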
def import_data(year):
    t1_time = t.time()
    year = str(year)  # force into a string

    # create import folder if not available
    if path.exists(gv.data_dir):
        pass
    else:
        mkdir(gv.data_dir)

    # create landing folder if not available
    if path.exists(gv.data_dir + '/landing'):
        pass
    else:
        mkdir(gv.data_dir + '/landing')

    # download file into import/landing folder
    url = 'https://www.retrosheet.org/events/'
    # year = sys.argv[1]
    zip_file = year + 'eve.zip'
    urllib.request.urlretrieve(url + zip_file, gv.data_dir + '/landing/' + zip_file)

    # create new folder for the unzipped contents
    if path.exists(gv.data_dir + '/' + year):
        pass
    else:
        mkdir(gv.data_dir + '/' + year)

    # unzip contents to the year folder
    try:
        with zf(gv.data_dir + '/landing/' + zip_file) as unzip:
            unzip.extractall(gv.data_dir + '/' + year)
    except Exception as e:  # accept any types of errors
        el.error_logger(e, 'unzipping import year: ' + str(e), None, year, '')
        return False

    # remove landing file
    try:
        if path.exists(gv.data_dir + '/landing/' + zip_file):
            remove(gv.data_dir + '/landing/' + zip_file)
    except Exception as e:  # accept any types of errors
        el.error_logger(e, 'removing landing file: ' + str(e), None, year, '')
        return False

    t2_time = t.time()

    # send completion notice
    conn = dbs.engine.connect()
    conn.fast_executemany = True
    finish_str = {
        'process_name': 'import_year',
        'data_year': year,
        'team_name': '---',
        'time_elapsed': t2_time - t1_time,
        'timestamp': t.strftime("%Y-%m-%d %H:%M:%S", t.localtime())
    }
    completion = pd.DataFrame([finish_str])
    completion.to_sql('process_log', conn, if_exists='append', index=False)
    return True
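# Hedged sketch of a batch driver for import_data() above, assuming the same
# module-level helpers (gv, el, dbs, pd) are available; the season range used by a
# caller is illustrative, not taken from the original source.
def import_seasons(first_year, last_year):
    failed = []
    for season in range(first_year, last_year + 1):
        if not import_data(season):  # import_data() coerces the year to str itself
            failed.append(season)
    return failed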
def convertToCSV(inp):
    csvLogFile = open('/home/aking/misc/pythonscraper/sec/logCSVFileConversion.log', 'a')
    subprocess.call(['sh', '/home/aking/misc/pythonscraper/sec/convertToCSV.sh', inp],
                    stdout=csvLogFile)
    csvLogFile.close()


def ss(str):
    return r.content[((r.content).rfind('\n', 0, (r.content).find(str))) + 1:
                     ((r.content).find('\n', (r.content).find(str)))]


r = requests.get('http://www.sec.gov/foia/iareports/inva-archive.htm')
fileExt = ss(d.strftime("%B %Y"))[ss(d.strftime("%B %Y")).index('/foia'):
                                  ss(d.strftime("%B %Y")).index('.zip') + 4]
filePath = "/home/aking/misc/pythonscraper/sec/tmp/"
# filePath = sys.argv[2]
result = None
while result is None:
    try:
        r = requests.get('http://www.sec.gov' + fileExt)
        z = zf(sio(r.content))
        z.extractall(filePath)
        fileNames = map(convertName, z.namelist())
        map(convertToCSV, fileNames)
        result = "SUCCESS"
        logging.info(result + ": REQUEST SUCCESSFULLY HANDLED AT: " + d.strftime("%Y-%m-%d:%H:%M:%S"))
    except:
        logging.info("ERROR: COULD NOT HANDLE REQUEST...RETRYING AT : " + d.strftime("%Y-%m-%d:%H:%M:%S"))
        sleep(60)
        pass
def main():
    import codecs
    parser = argparse.ArgumentParser(
        description="Extract lexicon file from xml")
    parser.add_argument("--infiles", "-i", nargs='+', type=argparse.FileType('r'),
                        help="input lexicon files")
    parser.add_argument("--outfile", "-o", help="output file")
    parser.add_argument("--version", "-v",
                        choices=["1.4", "1.5", "il3", "il5", "il6"], default="1.5",
                        help="dtd version")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    outdir = os.path.dirname(args.outfile)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    outfile = args.outfile

    poslabel = "POS"
    if args.version == "1.4":
        entrylabel = "ENTRY"
        wordlabel = "WORD"
        glosslabel = "GLOSS"
        dopos = True
    elif args.version == "1.5":
        entrylabel = "ENTRY"
        wordlabel = "LEMMA"
        glosslabel = "GLOSS"
        dopos = True
    elif args.version == "il3":
        entrylabel = "ENTRY"
        wordlabel = "WORD"
        glosslabel = "DEFINITION"
        dopos = False
    elif args.version == "il6":
        entrylabel = "Entry"
        wordlabel = "FormRep"
        glosslabel = "Equiv"
        poslabel = "{http://www.ormld.com/OromoLanguageData/}POS"
        dopos = True
    else:
        pass

    # for printing out at the end
    stats = 0
    of = codecs.open(outfile, 'w', 'utf-8')
    source_fh = open(os.path.join(outdir, "source"), 'a')
    infiles = args.infiles
    if args.version == "il6":
        neofiles = []
        for infile in infiles:
            archive = zf(infile.name)
            for info in archive.infolist():
                if info.file_size < 20:
                    continue
                neofiles.append(TextIOWrapper(archive.open(info, 'r')))
        infiles = neofiles
    if args.version == "il5":
        for infile in infiles:
            for line in infile:
                toks = line.strip().split()
                of.write("{}\tUNK\t{}\n".format(' '.join(toks[1:]), toks[0]))
                stats += 1
    else:
        for infile in infiles:
            xobj = ET.parse(infile)
            try:
                entrysearch = ".//{}".format(entrylabel)
                for entry in xobj.findall(entrysearch):
                    # POS hacked out and GLOSS->DEFINITION for IL
                    words = entry.findall(".//%s" % wordlabel)
                    possearch = ".//{}".format(poslabel)
                    poses = [x.text for x in entry.findall(possearch)] if dopos else ["UNK", ]
                    glosses = entry.findall(".//%s" % glosslabel)
                    if len(poses) != len(glosses):
                        if len(poses) == 1:
                            poses = [poses[0]] * len(glosses)
                        elif len(poses) == 0:
                            poses = ["UNK"] * len(glosses)
                        else:
                            sys.stderr.write("{} poses\n".format(len(poses)))
                            raise SkipEntry(ET.dump(entry))
                    for word in words:
                        for pos, gloss in zip(poses, glosses):
                            if gloss.text is None or word.text is None or pos is None:
                                continue
                            stats += 1
                            of.write("%s\t%s\t%s\n" % (word.text.strip(), pos.strip(), gloss.text.strip()))
            except SkipEntry as e:
                raise
            source_fh.write("Extracted lexicon from %s to %s on %s\nusing %s; command"
                            " issued from %s\n" % (infile.name, outfile,
                                                   datetime.datetime.now(),
                                                   ' '.join(sys.argv), os.getcwd()))

    # copy all files from lexicon directory to processed directory
    lexicon_dirs = set([os.path.dirname(x.name) for x in args.infiles])
    sys.stderr.write("Extracted %d entries\n" % (stats))
    for lexicon_dir in lexicon_dirs:
        for i in os.listdir(lexicon_dir):
            name = os.path.join(lexicon_dir, i)
            outname = '%s_%s' % (outfile, i)
            shutil.copy(name, outname)
            source_fh.write("Extracted extra lexicon from %s to %s\n" % (name, outname))
morphoutdir = os.path.join(args.outdir, args.morphsubdir)
dirs = [args.outdir, tokoutdir, cdectokoutdir, origoutdir, morphtokoutdir, morphoutdir]
for dir in dirs:
    if not os.path.exists(dir):
        os.makedirs(dir)
for infile in args.infile:
    inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
    archive = zf(infile)
    man_fh = writer(open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w'))
    orig_fh = writer(open(os.path.join(origoutdir, "%s.flat" % inbase), 'w'))
    tok_fh = writer(open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w'))
    morphtok_fh = writer(open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w'))
    morph_fh = writer(open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w'))
    for info in archive.infolist():
        if info.file_size < 20:
            continue
        # assume ltf structure
        if os.path.dirname(info.filename) != 'ltf':
            continue
        # print info.filename
        with archive.open(info, 'rU') as ifh:
            xobj = ET.parse(ifh)
            docid = xobj.findall(".//DOC")[0].get('id')
def epa_ucmr_consolidate(fdDir):
    """Consolidate EPA UCMR data."""
    # TODO double check this function; use less memory
    # < memory: merge other data, read/write/merge/append all3/all2 in chunks?
    d = check_directory_consolidate(fdDir.joinpath('epa/ucmr'))
    qprint('Consolidating {0}...'.format(d), end="\r")
    with zf(str(d / 'ucmr-3-occurrence-data.zip'), 'r') as zfile3, \
            zf(str(d / 'ucmr2_occurrencedata_jan12.zip'), 'r') as zfile2:
        all3 = pd.read_table(
            zfile3.open('UCMR3_All.txt'),
            encoding='latin1',
            dtype={
                'PWSID': str,
                'PWSName': str,
                'Size': str,
                'FacilityID': str,
                'FacilityName': str,
                'FacilityWaterType': str,
                'SamplePointID': str,
                'SamplePointName': str,
                'SamplePointType': str,
                'AssociatedFacilityID': str,
                'AssociatedSamplePointID': str,
                'CollectionDate': str,
                'SampleID': str,
                'Contaminant': str,
                'MRL': float,
                'MethodID': str,
                'AnalyticalResultsSign': str,
                'AnalyticalResultValue': float,
                'SampleEventCode': str,
                'MonitoringRequirement': str,
                'Region': str,
                'State': str,
            }
        )
        drt = pd.read_table(
            zfile3.open('UCMR3_DRT.txt'),
            encoding='latin1',
            dtype={
                'PWSID': str,
                'FacilityID': str,
                'SamplePointID': str,
                'SampleEventCode': str,
                'CollectionDate': str,
                'Disinfectant Type': str,
            }
        )
        all3 = pd.merge(
            all3, drt, how='left',
            on=[
                'PWSID',
                'FacilityID',
                'SamplePointID',
                'CollectionDate',
            ]
        )
        del drt
        zipcodes = pd.read_table(
            zfile3.open('UCMR3_ZipCodes.txt'),
            encoding='latin1',
            dtype={
                'PWSID': str,
                'ZIPCODE': str,
            })
        all3 = pd.merge(all3, zipcodes, how='left', on='PWSID')
        all2 = pd.read_table(
            zfile2.open('UCMR2_All_OccurrenceData_Jan12.txt'),
            encoding='latin1',
            dtype={
                'PWSID': str,
                'PWSName': str,
                'Size': str,
                'FacilityID': str,
                'FacilityName': str,
                'FacilityWaterType': str,
                'SamplePointID': str,
                'SamplePointName': str,
                'SamplePointType': str,
                'AssociatedFacilityID': str,
                'AssociatedSamplePointID': str,
                'DisinfectantType': str,
                'CollectionDate': str,
                'SampleID': str,
                'Contaminant': str,
                'MRL': float,
                'MethodID': str,
                'AnalyticalResultsSign': str,
                'AnalyticalResultValue': float,
                'SampleEventCode': str,
                'MonitoringRequirement': str,
                'Region': str,
                'State': str,
            }
        )
        all2 = pd.merge(all2, zipcodes, how='left', on='PWSID')
    all = all3.append(all2, ignore_index=True)
    del all3, all2
    csvfile = d / 'data.csv'
    with csvfile.open('a') as f:
        all.to_csv(f, index=False, float_format='%.2f')
    qprint('epa:ucmr data consolidated\x1b[K.')
def _do_pass(self, cr, uid, ids, context=None):
    """
    Open the ZIP file, take the CSV file inside it and parse it to import payroll entries
    """
    # Do verifications
    if not context:
        context = {}
    # Verify that no draft payroll entries exist
    line_ids = self.pool.get('hr.payroll.msf').search(cr, uid, [('state', '=', 'draft')])
    if len(line_ids):
        raise osv.except_osv(_('Error'), _('You cannot import payroll entries. Please validate first draft payroll entries!'))
    # Prepare some values
    file_ext_separator = '.'
    file_ext = "csv"
    message = _("Payroll import failed.")
    res = False
    created = 0
    processed = 0
    header_vals = {}
    xyargv = self._get_homere_password(cr, uid, pass_type='payroll')
    filename = ""
    wiz_state = False
    # Browse all given wizards
    for wiz in self.browse(cr, uid, ids):
        if not wiz.file:
            raise osv.except_osv(_('Error'), _('Nothing to import.'))
        if not wiz_state:
            wiz_state = wiz.state
        # Decode file string
        fileobj = NamedTemporaryFile('w+b', delete=False)
        fileobj.write(decodestring(wiz.file))
        # now we determine the file format
        filename = fileobj.name
        fileobj.close()
        try:
            zipobj = zf(filename, 'r')
            filename = wiz.filename or ""
        except:
            raise osv.except_osv(_('Error'), _('Given file is not a zip file!'))
        if zipobj.namelist():
            namelist = zipobj.namelist()
            # Search CSV
            csvfile = None
            for name in namelist:
                if name.split(file_ext_separator) and name.split(file_ext_separator)[-1] == file_ext:
                    csvfile = name
            if not 'envoi.ini' in namelist:
                raise osv.except_osv(_('Warning'), _('No envoi.ini file found in given ZIP file!'))
            # Read information from 'envoi.ini' file
            field = False
            try:
                import ConfigParser
                Config = ConfigParser.SafeConfigParser()
                Config.readfp(zipobj.open('envoi.ini', 'r', xyargv))
                field = Config.get('DEFAUT', 'PAYS')
            except Exception, e:
                raise osv.except_osv(_('Error'), _('Could not read envoi.ini file in given ZIP file.'))
            if not field:
                raise osv.except_osv(_('Warning'), _('Field not found in envoi.ini file.'))
            # Read CSV file
            if csvfile:
                try:
                    reader = csv.reader(zipobj.open(csvfile, 'r', xyargv),
                                        delimiter=';', quotechar='"',
                                        doublequote=False, escapechar='\\')
                    reader.next()
                except:
                    fileobj.close()
                    raise osv.except_osv(_('Error'), _('Problem to read given file.'))
                res = True
                res_amount = 0.0
                amount = 0.0
                error_msg = ""
                for line in reader:
                    processed += 1
                    update, amount, nb_created, vals, ccy, msg = self.update_payroll_entries(
                        cr, uid, data=line, field=field,
                        date_format=wiz.date_format, wiz_state=wiz.state)
                    res_amount += round(amount, 2)
                    if not update:
                        res = False
                    if created == 0:
                        header_vals = vals
                        header_vals['currency_code'] = ccy
                    created += nb_created
                    if msg:
                        error_msg += "Line " + str(processed) + ": " + msg + " \n"
                # Check balance
                res_amount_rounded = round(res_amount, 2)
                if res_amount_rounded != 0.0:
                    self._uf_side_rounding_line_check_gap(
                        cr, uid, header_vals['currency_id'], header_vals['currency_code'],
                        header_vals['date'], res_amount_rounded, context=context)
                    # adapt difference by writing on payroll rounding line
                    pr_ids = self.pool.get('hr.payroll.msf').search(
                        cr, uid, [
                            ('state', '=', 'draft'),
                            ('name', '=', 'Payroll rounding')
                        ])
                    if not pr_ids:
                        # no SAGA BALANCE rounding line in file
                        # => create one UF side (US-201)
                        if wiz.state == 'simu':
                            self.write(cr, uid, [wiz.id], {
                                'state': 'proceed',
                                'msg': UF_SIDE_ROUNDING_LINE['msg_nb'] % (
                                    res_amount_rounded,
                                    header_vals['currency_code'],
                                )
                            })
                        else:
                            self._uf_side_rounding_line_create(
                                cr, uid, ids, context=context,
                                header_vals=header_vals,
                                amount=-1 * res_amount_rounded)
                        # raise osv.except_osv(_('Error'), _('An error occurred on balance and no payroll rounding line found.'))
                    else:
                        # Fetch Payroll rounding amount line and update
                        pr = self.pool.get('hr.payroll.msf').browse(cr, uid, pr_ids[0])
                        # To compute new amount, you should:
                        # - take payroll rounding amount
                        # - take the opposite of res_amount (which is the current difference)
                        # - add both
                        new_amount = round(pr.amount, 2) + (-1 * res_amount_rounded)
                        self.pool.get('hr.payroll.msf').write(cr, uid, pr_ids[0], {'amount': round(new_amount, 2), })
            else:
                raise osv.except_osv(_('Error'), _('Right CSV is not present in this zip file. Please use "File > File sending > Monthly" in Homère.'))
        fileobj.close()
def extractallzip(rprt):
    # Extract every .zip archive found in the report directory into that same directory.
    for fold in os.listdir(rprt):
        if fnmatch.fnmatch(fold, '*.zip'):
            print('Extracting ' + fold + '...')
            with zf(os.path.join(rprt, fold), 'r') as archive:
                archive.extractall(rprt)
            print(fold + ' extracted !')