Example #1
def find_rest_t1(case_dir):
    """Search for the AC-PC Aligned T1 image that will be used for segmentation and partial volume correction."""
    os.chdir(case_dir)
    os.mkdir('T1_Segmentation')
    T1_dir = case_dir + '/' + 'T1_Segmentation'
    for fname in os.listdir(case_dir):
        if fname.startswith('MP-LAS-long') and fname.endswith('.nii'):
            T1File = fname
            shutil.move(T1File, T1_dir)
            break
        elif fname.startswith('MP-LAS-long') and fname.endswith('.zip'):
            T1File = fname
            shutil.move(T1File, T1_dir)
            os.chdir(T1_dir)
            with zf(T1File) as zf_name:
                zf_name.extractall()
            os.system('dcm2nii *')
            break
        elif fname.startswith('MP-LAS') and fname.endswith('.nii'):
            T1File = fname
            shutil.move(T1File, T1_dir)
            break
        elif fname.startswith('MP-LAS') and fname.endswith('.zip'):
            T1File = fname
            shutil.move(T1File, T1_dir)
            os.chdir(T1_dir)
            with zf(T1File) as zf_name:
                zf_name.extractall()
            os.system('dcm2nii *')
            break

    return T1_dir
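Note: every snippet in this listing calls the archive class through the short alias zf, with no imports shown. A minimal sketch of the imports Example #1 appears to rely on; the alias is inferred from usage and is an assumption, not something the source file shows:

# Assumed imports (inferred from usage; not part of the original project file).
import os
import shutil
from zipfile import ZipFile as zf  # the examples consistently call the archive class as zf(...)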
Example #2
def download_job(job_id):
    url = base_url + '/job/' + job_id

    try:
        if not os.path.isdir("job/"):
            os.mkdir("job")
        os.mkdir("job/" + job_id)
    except:
        updateStatus("There was a problem while creating job folder", 0)
        return False

    try:
        updateStatus("Downloading job", 10)
        file_name = "job/job_" + job_id + ".zip"
        u = urllib2.urlopen(url)
        f = open(file_name, 'wb')
        context = u.read()
        f.write(context)
        f.close()
    except:
        updateStatus("There was a problem while downloading the job.", 0)
        return False

    try:
        z = zf("job/job_" + job_id + ".zip")
        z.extractall("job/" + job_id)
        os.remove("job/job_" + job_id + ".zip")
    except:
        updateStatus("Downloaded file is broken", 20)
        return False

    return True
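Example #2 depends on urllib2, which exists only on Python 2, and never closes the ZipFile it opens. A rough Python 3 sketch of the same download-and-unpack flow; base_url comes from the example's surrounding module and the job/ layout is kept as-is:

import os
import urllib.request
from zipfile import ZipFile

def download_job_py3(job_id, base_url):
    """Sketch only: fetch <base_url>/job/<job_id> and unpack it under job/<job_id>."""
    url = base_url + '/job/' + job_id
    os.makedirs(os.path.join("job", job_id), exist_ok=True)
    zip_path = os.path.join("job", "job_" + job_id + ".zip")

    # urllib.request replaces urllib2 on Python 3; both handles are closed by the with-block.
    with urllib.request.urlopen(url) as response, open(zip_path, 'wb') as out:
        out.write(response.read())

    # The context manager closes the archive once extraction is done.
    with ZipFile(zip_path) as archive:
        archive.extractall(os.path.join("job", job_id))
    os.remove(zip_path)
    return True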
Example #3
def zip_results():
    try:
        updateStatus('Compressing results', 80)
        z = zf("result.zip", mode="w")
        z.write("result.bam")
        z.write("result.bam.bai")
        os.remove("result.bam")
        os.remove("result.bam.bai")
        return True
    except:
        return False
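Example #3 reports "Compressing results", but zf("result.zip", mode="w") uses zipfile's default of ZIP_STORED, which stores the members uncompressed, and the archive object is never closed. A small sketch of the same step with deflate compression and a context manager (same file names as the example):

import os
import zipfile

def zip_results_sketch():
    # ZIP_DEFLATED actually compresses; the default ZIP_STORED only stores.
    with zipfile.ZipFile("result.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as z:
        z.write("result.bam")
        z.write("result.bam.bai")
    # Remove the originals only after the archive has been written and closed.
    os.remove("result.bam")
    os.remove("result.bam.bai")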
Example #4
def job_upload():
    result = request.files["result_file"]
    if not result:
        return "No file uploaded"
    job_id = request.form["job_id"]
    if not job_id:
        return "No job_id specified"
    path = os.path.join(conf.get("folders", "result"), str(job_id))
    if not os.path.exists(path):
        os.makedirs(path)
    filename = secure_filename(result.filename)
    fullpath = os.path.join(path, filename)
    result.save(fullpath)
    try:
        p = zf(fullpath)
        p.extractall(path)
        print "Unzipped the file"
        bam_file = os.path.join(path, "result.bam")
        cur = conn.cursor()
        cur.execute("update job set result_file = %s where job_id = %s", (bam_file, job_id))
        conn.commit()
        call([conf.get("executables", "demux"), job_id, conf.get("files", "muxkey"), conf.get("folders", "groups")])
        print "finished demux, checking completed groups"
        cur.execute(
            "select group_id from fastq_file_group where (select count(*) from job where job.group_id = fastq_file_group.group_id and completed = false) = 0 and merged = false;")
        group = cur.fetchone()
        if group != None:
            group_id = group[0]
            print "found group ", group_id
            cur.execute("select file_id from fastq_file where group_id = %s;", (group_id,))
            while True:
                file = cur.fetchone()
                if file == None:
                    break
                file_id = file[0]
                print "merging FASTQ" + str(file_id)
                group_path = os.path.join(conf.get("folders", "groups"), "FASTQ" + str(file_id))
                output = os.path.join(group_path, "result.bam")
                files = os.path.join(group_path, "*")
                print group_path
                print output
                print files
                call("samtools merge " + output + " " + files, shell=True)
                call("samtools index " + output, shell=True)
            cur.execute("update fastq_file_group set merged = true where group_id = %s", (group_id,))
            conn.commit()
            cur.close()
            print "merged group", group_id
        print "no groups to merge"
    except:
        return "Error"
    return "Done"
Example #5
def unzip_rest_get_dir(rest_rawz_dir, case_dir):

    # Unzip the raw asl dicom dir and return the new unzipped dir.
    with zf(rest_rawz_dir) as zf_dir:
        zf_dir.extractall('rsfMRI-raw')

    # The extracted archive is expected to contain a single sub-directory.
    for fname in os.listdir(case_dir + '/' + 'rsfMRI-raw'):
        rest_sub_dir = fname

    # Create a variable for the dicom directory
    dicomdir = case_dir + '/' + 'rsfMRI-raw' + '/' + rest_sub_dir

    return dicomdir
Example #6
    def read_files(self, cr, uid, filename):
        staff_file = 'staff.csv'
        contract_file = 'contrat.csv'
        job_file = 'fonction.csv'
        job_reader = False
        contract_reader = False
        staff_reader = False
        desc_to_close = []
        tmpdir = False
        if is_zipfile(filename):
            zipobj = zf(filename)
            if zipobj.namelist() and job_file in zipobj.namelist():
                job_reader = csv.DictReader(zipobj.open(job_file), quotechar='"', delimiter=',', doublequote=False, escapechar='\\')
                # Do not raise an error for the job file: it is useful extra data, but not required.
        # read the contract file
            if zipobj.namelist() and contract_file in zipobj.namelist():
                contract_reader = csv.DictReader(zipobj.open(contract_file), quotechar='"', delimiter=',', doublequote=False, escapechar='\\')
        # read the staff file
            if zipobj.namelist() and staff_file in zipobj.namelist():
                # Doublequote and escapechar avoid some problems
                staff_reader = csv.DictReader(zipobj.open(staff_file), quotechar='"', delimiter=',', doublequote=False, escapechar='\\')
        else:
            tmpdir = self._extract_7z(cr, uid, filename)
            job_file_name = os.path.join(tmpdir, job_file)
            if os.path.isfile(job_file_name):
                job_file_desc = open(job_file_name, 'rb')
                desc_to_close.append(job_file_desc)
                job_reader = csv.DictReader(job_file_desc, quotechar='"', delimiter=',', doublequote=False, escapechar='\\')

            contract_file_name = os.path.join(tmpdir, contract_file)
            if os.path.isfile(contract_file_name):
                contract_file_desc = open(contract_file_name, 'rb')
                desc_to_close.append(contract_file_desc)
                contract_reader = csv.DictReader(contract_file_desc, quotechar='"', delimiter=',', doublequote=False, escapechar='\\')

            staff_file_name = os.path.join(tmpdir, staff_file)
            if os.path.isfile(staff_file_name):
                staff_file_desc = open(staff_file_name, 'rb')
                desc_to_close.append(staff_file_desc)
                staff_reader = csv.DictReader(staff_file_desc, quotechar='"', delimiter=',', doublequote=False, escapechar='\\')

        if not contract_reader:
            raise osv.except_osv(_('Error'), _('%s not found in given zip file!') % (contract_file,))
        if not staff_reader:
            raise osv.except_osv(_('Error'), _('%s not found in given zip file!') % (staff_file,))
        return (job_reader, contract_reader, staff_reader, desc_to_close, tmpdir)
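Example #6 passes zipobj.open(...) directly to csv.DictReader, which works on Python 2. On Python 3, ZipFile.open returns a binary stream, so the reader needs a text wrapper; a minimal sketch of that pattern (archive name and encoding are assumptions):

import csv
import io
from zipfile import ZipFile

with ZipFile('export.zip') as zipobj:                      # placeholder archive name
    if 'contrat.csv' in zipobj.namelist():
        raw = zipobj.open('contrat.csv')                   # binary stream on Python 3
        text = io.TextIOWrapper(raw, encoding='utf-8', newline='')
        reader = csv.DictReader(text, quotechar='"', delimiter=',',
                                doublequote=False, escapechar='\\')
        for row in reader:
            print(row)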
Example #7
File: fd.py Project: roualdes/fd
def bls_cew_consolidate(fdDir):
    """Consolidate downloaded BLS CEW data."""
    d = check_directory_consolidate(fdDir.joinpath('bls/cew'))
    zips = d.glob('*.zip')
    csvfile = d / 'data.csv'
    dtypes = get_bls_dtypes(bls_cew)
    header = True                 # write header only once

    with csvfile.open('a') as f:
        for z in zips:

            qprint('Consolidating {0}...'.format(str(z).split('/')[-1]),
                   end="\r")

            with zf(str(z), 'r') as zfile:
                csvs = (csv for csv in zfile.namelist()
                        if re.search(r'all industries.csv', csv))
                for csv in csvs:
                    for chunk in pd.read_csv(zfile.open(csv), chunksize=10000):
                        if False:
                            # TODO consolidate only fips rows of CSVs
                            # TODO need have fips.csv on hand
                            fips = pd.read_csv(fdDir + "fips.csv")
                            chunk = chunk[chunk.area_fips.isin(fips.fips)]

                        # fix incorrectly named column
                        try:
                            chunk.rename(
                                columns={'oty_taxable_qtrly_wages_chg.1':
                                         'oty_taxable_qtrly_wages_pct', },
                                inplace=True)
                        except KeyError:
                            pass

                        # make data types match across chunks
                        chunk = convert_dtypes(chunk, dtypes)
                        chunk.to_csv(f, header=header, index=False,
                                     float_format='%.2f')
                        header = False

    qprint("bls:cew data consolidated\x1b[K.")
Example #8
def main():
  import codecs
  parser = argparse.ArgumentParser(description="Print monolingual data")
  parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('r'), default=[sys.stdin,], help="input zip file(s) (each contains a multi file)")
  #  parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file (single text file)")
  parser.add_argument("--outfile", "-o", help="output file (single text file)")
  parser.add_argument("--xml", "-x", action='store_true', help="process ltf xml files")
  parser.add_argument("--tokenize", action='store_true', help="use tokens (only applies if -x)")


  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  of = codecs.open(args.outfile, 'w', 'utf-8')
  for infile in args.infile:
    archive = zf(infile)
    for info in archive.infolist():
      if info.file_size < 20:
        continue
      # plain processing assumes rsd structure
      if not args.xml and os.path.dirname(info.filename) != 'rsd':
        continue
      # print info.filename
      with archive.open(info, 'rU') as ifh:
        if args.xml:
          xobj = ET.parse(ifh)
          if args.tokenize:
            of.writelines([ ' '.join([ y.text for y in x.findall(".//TOKEN") ])+"\n" for x in xobj.findall(".//SEG") ])
          else:
            of.writelines([ x.text+"\n" for x in xobj.findall(".//ORIGINAL_TEXT") ])
        else:
          lines = ifh.readlines()
          for line in lines:
            of.write(line.decode('utf8'))
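Examples #8, #9, #10 and #15 read members with archive.open(info, 'rU') and, in Example #8, hand zf an argparse.FileType('r') handle. Both are Python 2 habits: recent Python 3 releases accept only 'r'/'w' for ZipFile.open (the 'U'/'rU' modes are gone), and ZipFile wants a binary handle, as Example #9's FileType("rb") already provides. A minimal Python 3 sketch of the member-reading loop (archive name is a placeholder):

import xml.etree.ElementTree as ET
from zipfile import ZipFile

with ZipFile('ltf_archive.zip') as archive:        # placeholder; any LTF-style zip
    for info in archive.infolist():
        if info.file_size < 20:
            continue
        with archive.open(info) as member:         # mode 'r' only on Python 3
            xobj = ET.parse(member)                # ET accepts the binary stream directly
            # wrap with io.TextIOWrapper(member) instead when line-by-line text is needed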
Example #9
def main():
    import codecs

    parser = argparse.ArgumentParser(
        description="Extract and print psm annotat"
        "ion data from LRLP in a form that is amen"
        "able to insertion into future xml",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--infile",
        "-i",
        nargs="+",
        type=argparse.FileType("rb"),
        default=[sys.stdin],
        help="input zip file(s)" " (each contains a multi file)",
    )
    parser.add_argument(
        "--outfile",
        "-o",
        type=argparse.FileType("w"),
        default=sys.stdout,
        help="where to write extracted semantic info",
    )
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    outfile = args.outfile

    nonehash = {"value": "None"}

    for infile in args.infile:
        inbase = ".".join(os.path.basename(infile.name).split(".")[:-2])
        archive = zf(infile)
        for info in archive.infolist():
            if info.file_size < 20:
                continue
            # Assume psm structure
            if os.path.dirname(info.filename) != "psm":
                continue
            with archive.open(info, "rU") as ifh:
                xobj = ET.parse(ifh)
                try:
                    headlines = [
                        (x.get("begin_offset"), x.get("char_length")) for x in xobj.findall("string[@type='headline']")
                    ]
                    # TODO: funornone this back into functional
                    postnodes = xobj.findall("string[@type='post']")
                    posts = []
                    for x in postnodes:
                        post = []
                        anode = x.find("attribute[@name='author']")
                        if anode is None:
                            anode = nonehash
                        dnode = x.find("attribute[@name='datetime']")
                        if dnode is None:
                            dnode = nonehash
                        posts.append(
                            (x.get("begin_offset"), x.get("char_length"), anode.get("value"), dnode.get("value"))
                        )
                except:
                    print(info.filename)
                    raise
                    sys.exit(1)

                # GENRE/LANG/DATE info will be gleaned from filename later.
                # assume psm.xml and strip it off
                fname = os.path.basename(info.filename).split(".psm.xml")[0]
                for h in headlines:
                    outfile.write("\t".join(("headline", fname) + h) + "\n")
                for p in posts:
                    outfile.write("\t".join(("post", fname) + p) + "\n")
Example #10
def main():
  parser = argparse.ArgumentParser(description="Extract and print monolingual" \
                                   " data, tokenized, morph, pos tag and " \
                                   "original, with manifests")
  parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('rb'),
                      default=[sys.stdin,],
                      help="input zip file(s) (each contains a multi file)")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                              "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))


  tokoutdir=os.path.join(args.outdir, args.toksubdir)
  origoutdir=os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir=os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir=os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir=os.path.join(args.outdir, args.morphsubdir)
  posoutdir=os.path.join(args.outdir, args.possubdir)

  dirs = [args.outdir,
          tokoutdir,
          cdectokoutdir,
          origoutdir,
          morphtokoutdir,
          morphoutdir,
          posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir=os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for dir in dirs:
    if not os.path.exists(dir):
      os.makedirs(dir)

  defaultcount=0
  for infile in args.infile:
    inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
    if len(inbase) == 0:
      inbase="default.%d" % defaultcount
      defaultcount+=1
    archive = zf(infile)
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase),'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase),'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir,
                                           "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for info in archive.infolist():
      if info.file_size < 20:
        continue
      # assume ltf filename
      if not info.filename.endswith("ltf.xml"):
        continue
      # print info.filename
      with archive.open(info, 'rU') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          origlines = [ x.text+"\n" for x in xobj.findall(".//ORIGINAL_TEXT") ]
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [ [ x.get(y) for y in ('id', 'start_char', 'end_char') ]
                      for x in xobj.findall(".//SEG") ]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [info.filename,docid]+tup))+"\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [info.filename,docid]+tup))+"\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext)+"\n")
            morphtok_fh.write(' '.join(morphtoktext)+"\n")
            morph_fh.write(' '.join(morphtext)+"\n")
            pos_fh.write(' '.join(postext)+"\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on "+ifh.name+"\n")
          continue
    orig_fh.close()
    cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                         orig_fh.name,
                                         os.path.join(cdectokoutdir,
                                                      "%s.flat.lc" % inbase),
                                         os.path.join(cdectokoutdir,
                                                      "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(cdec_cmd))
    p.wait()
Example #11
def distance(distance, *args):

    if isinstance(distance, str):
        distance_list = [distance]
    elif isinstance(distance, list):
        distance_list = distance
    else:
        raise Exception(
            "Unknown distance type. Provide a name (str) or a list of str.")

    for dist in distance_list:
        if dist not in DISTANCES:
            raise Exception("Unknown distance " + dist +
                            ". The available ones are: " + ' '.join(DISTANCES))

    if len(args) == 1 and not isinstance(args[0], list):
        raise Exception(
            "Error: You only provided one language argument.\nProvide multiple language arguments, or a single list of languages as arguments."
        )
    if len(args) == 1 and isinstance(args[0], list):
        langs = args[0]
    else:
        langs = [l for l in args]
    for l in langs:
        if l not in DISTANCE_LANGUAGES:
            raise Exception(
                "Unknown language " + l +
                " (or maybe we don't have precomputed distances for this one)."
            )
    indeces = [DISTANCE_LANGUAGES.index(l) for l in langs]

    N = len(indeces)
    if N == 2:
        out = []
        with zf(DISTANCES_FILE, 'r') as zp:
            for dist in distance_list:
                data = sparse.load_npz(zp.open(map_distance_to_filename(dist)))
                if indeces[0] > indeces[1]:
                    out.append(data[indeces[1], indeces[0]])
                else:
                    out.append(data[indeces[0], indeces[1]])
        if len(out) > 1:
            return out
        else:
            return out[0]
    else:
        arr_list = [np.zeros((N, N)) for dist in distance_list]
        with zf(DISTANCES_FILE, 'r') as zp:
            for k, dist in enumerate(distance_list):
                data = sparse.load_npz(zp.open(map_distance_to_filename(dist)))
                for a, i in enumerate(indeces):
                    for b, j in enumerate(indeces):
                        if a != b:
                            if i > j:
                                arr_list[k][a, b] = data[j, i]
                            else:
                                arr_list[k][a, b] = data[i, j]
        if len(arr_list) > 1:
            return arr_list
        else:
            return arr_list[0]
Example #12
def import_data(year):

    t1_time = t.time()
    year = str(year)  # force into a string

    # create import folder if not available
    if not path.exists(gv.data_dir):
        mkdir(gv.data_dir)

    # create landing folder if not available
    if not path.exists(gv.data_dir + '/landing'):
        mkdir(gv.data_dir + '/landing')

    # download file into import/landing folder
    url = 'https://www.retrosheet.org/events/'
    # year = sys.argv[1]
    zip_file = year + 'eve.zip'
    urllib.request.urlretrieve(url + zip_file,
                               gv.data_dir + '/landing/' + zip_file)

    # create new folder for the unzipped contents
    if not path.exists(gv.data_dir + '/' + year):
        mkdir(gv.data_dir + '/' + year)

    # unzip contents to the year folder
    try:
        with zf(gv.data_dir + '/landing/' + zip_file) as unzip:
            unzip.extractall(gv.data_dir + '/' + year)
    except Exception as e:
        # accept any types of errors
        el.error_logger(e, 'unzipping import year: ' + str(e), None, year, '')
        return False

    # remove landing file
    try:
        if path.exists(gv.data_dir + '/landing/' + zip_file):
            remove(gv.data_dir + '/landing/' + zip_file)
    except Exception as e:
        # accept any types of errors
        el.error_logger(e, 'removing landing file: ' + str(e), None, year, '')
        return False

    t2_time = t.time()

    # send completion notice
    conn = dbs.engine.connect()
    conn.fast_executemany = True
    finish_str = {
        'process_name': 'import_year',
        'data_year': year,
        'team_name': '---',
        'time_elapsed': t2_time - t1_time,
        'timestamp': t.strftime("%Y-%m-%d %H:%M:%S", t.localtime())
    }
    completion = pd.DataFrame([finish_str])
    completion.to_sql('process_log', conn, if_exists='append', index=False)

    return True
Example #13
def convertToCSV(inp):
    csvLogFile = open('/home/aking/misc/pythonscraper/sec/logCSVFileConversion.log', 'a')
    subprocess.call(['sh', '/home/aking/misc/pythonscraper/sec/convertToCSV.sh', inp], stdout=csvLogFile)
    csvLogFile.close()


def ss(str):
    return r.content[((r.content).rfind('\n',0,(r.content).find(str)))+1:((r.content).find('\n',(r.content).find(str)))]


r = requests.get('http://www.sec.gov/foia/iareports/inva-archive.htm')
fileExt = ss(d.strftime("%B %Y"))[ss(d.strftime("%B %Y")).index('/foia'):ss(d.strftime("%B %Y")).index('.zip')+4]
filePath = "/home/aking/misc/pythonscraper/sec/tmp/"
#filePath = sys.argv[2]

result=None
while result is None:
    try:
        r = requests.get('http://www.sec.gov'+fileExt)
        z = zf(sio(r.content))
        z.extractall(filePath)
        fileNames = map(convertName, z.namelist())
        map(convertToCSV, fileNames)
        result = "SUCCESS"
        logging.info(result + ": REQUEST SUCCESSFULLY HANDLED AT: " + d.strftime("%Y-%m-%d:%H:%M:%S"))
    except:
        logging.info("ERROR: COULD NOT HANDLE REQUEST...RETRYING AT : " + d.strftime("%Y-%m-%d:%H:%M:%S"))
        sleep(60)
        pass
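Example #13 wraps the HTTP response in sio(...) (a StringIO), which only works on Python 2 where r.content is a str. On Python 3 the body is bytes, so the in-memory archive needs io.BytesIO; a small sketch of that step with placeholder URL and destination:

import io
import zipfile
import requests

def fetch_and_extract(url, dest):
    """Sketch: download a zip over HTTP and extract it without writing the archive to disk."""
    payload = requests.get(url).content            # bytes on Python 3
    with zipfile.ZipFile(io.BytesIO(payload)) as z:
        z.extractall(dest)
        return z.namelist()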
Example #14
0
def main():
    import codecs
    parser = argparse.ArgumentParser(
        description="Extract lexicon file from xml")
    parser.add_argument("--infiles",
                        "-i",
                        nargs='+',
                        type=argparse.FileType('r'),
                        help="input lexicon files")
    parser.add_argument("--outfile", "-o", help="output file")
    parser.add_argument("--version",
                        "-v",
                        choices=["1.4", "1.5", "il3", "il5", "il6"],
                        default="1.5",
                        help="dtd version")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    outdir = os.path.dirname(args.outfile)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    outfile = args.outfile
    poslabel = "POS"
    if args.version == "1.4":
        entrylabel = "ENTRY"
        wordlabel = "WORD"
        glosslabel = "GLOSS"
        dopos = True
    elif args.version == "1.5":
        entrylabel = "ENTRY"
        wordlabel = "LEMMA"
        glosslabel = "GLOSS"
        dopos = True
    elif args.version == "il3":
        entrylabel = "ENTRY"
        wordlabel = "WORD"
        glosslabel = "DEFINITION"
        dopos = False
    elif args.version == "il6":
        entrylabel = "Entry"
        wordlabel = "FormRep"
        glosslabel = "Equiv"
        poslabel = "{http://www.ormld.com/OromoLanguageData/}POS"
        dopos = True
    else:
        pass

    # for printing out at the end
    stats = 0

    of = codecs.open(outfile, 'w', 'utf-8')
    source_fh = open(os.path.join(outdir, "source"), 'a')
    infiles = args.infiles
    if args.version == "il6":
        neofiles = []
        for infile in infiles:
            archive = zf(infile.name)
            for info in archive.infolist():
                if info.file_size < 20:
                    continue
                neofiles.append(TextIOWrapper(archive.open(info, 'r')))
        infiles = neofiles
    if args.version == "il5":
        for infile in infiles:
            for line in infile:
                toks = line.strip().split()
                of.write("{}\tUNK\t{}\n".format(' '.join(toks[1:]), toks[0]))
                stats += 1
    else:
        for infile in infiles:
            xobj = ET.parse(infile)
            try:
                entrysearch = ".//{}".format(entrylabel)
                for entry in xobj.findall(entrysearch):
                    # POS hacked out and GLOSS->DEFINITION for IL
                    words = entry.findall(".//%s" % wordlabel)
                    possearch = ".//{}".format(poslabel)
                    poses = [x.text
                             for x in entry.findall(possearch)] if dopos else [
                                 "UNK",
                             ]
                    glosses = entry.findall(".//%s" % glosslabel)
                    if len(poses) != len(glosses):
                        if len(poses) == 1:
                            poses = [poses[0]] * len(glosses)
                        elif len(poses) == 0:
                            poses = ["UNK"] * len(glosses)
                        else:
                            sys.stderr.write("{} poses\n".format(len(poses)))
                            raise SkipEntry(ET.dump(entry))
                    for word in words:
                        for pos, gloss in zip(poses, glosses):
                            if gloss.text is None or word.text is None or pos is None:
                                continue
                            stats += 1
                            of.write("%s\t%s\t%s\n" %
                                     (word.text.strip(), pos.strip(),
                                      gloss.text.strip()))
            except SkipEntry as e:
                raise


        source_fh.write("Extracted lexicon from %s to %s on %s\nusing %s; command" \
                        " issued from %s\n" % (infile.name, outfile,
                                               datetime.datetime.now(),
                                               ' '.join(sys.argv), os.getcwd()))

    # copy all files from lexicon directory to processed directory
    lexicon_dirs = set([os.path.dirname(x.name) for x in args.infiles])
    sys.stderr.write("Extracted %d entries\n" % (stats))
    for lexicon_dir in lexicon_dirs:
        for i in os.listdir(lexicon_dir):
            name = os.path.join(lexicon_dir, i)
            outname = '%s_%s' % (outfile, i)
            shutil.copy(name, outname)
            source_fh.write("Extracted extra lexicon from %s to %s\n" %
                            (name, outname))
Example #15
  morphoutdir=os.path.join(args.outdir, args.morphsubdir)

  dirs = [args.outdir,   
          tokoutdir,     
          cdectokoutdir, 
          origoutdir,    
          morphtokoutdir,
          morphoutdir]
  for dir in dirs:
    if not os.path.exists(dir):      
      os.makedirs(dir)
    

  for infile in args.infile:
    inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
    archive = zf(infile)
    man_fh = writer(open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w'))
    orig_fh = writer(open(os.path.join(origoutdir, "%s.flat" % inbase), 'w'))
    tok_fh = writer(open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w'))
    morphtok_fh = writer(open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w'))
    morph_fh = writer(open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w'))
    for info in archive.infolist():
      if info.file_size < 20:
        continue
      # assume ltf structure
      if os.path.dirname(info.filename) != 'ltf':
        continue
      # print info.filename
      with archive.open(info, 'rU') as ifh:
        xobj = ET.parse(ifh)
        docid = xobj.findall(".//DOC")[0].get('id')
Example #16
File: fd.py Project: roualdes/fd
def epa_ucmr_consolidate(fdDir):
    """Conslidate EPA UCMR data."""
    # TODO double check this function; use less memory
    # < memory: merge other data, read/write/merge/append all3/all2 in chunks?

    d = check_directory_consolidate(fdDir.joinpath('epa/ucmr'))
    qprint('Consolidating {0}...'.format(d), end="\r")

    with zf(str(d/'ucmr-3-occurrence-data.zip'), 'r') as zfile3, zf(str(d/'ucmr2_occurrencedata_jan12.zip'), 'r') as zfile2:
        all3 = pd.read_table(
            zfile3.open('UCMR3_All.txt'),
            encoding='latin1',
            dtype={
                'PWSID': str,
                'PWSName': str,
                'Size': str,
                'FacilityID': str,
                'FacilityName': str,
                'FacilityWaterType': str,
                'SamplePointID': str,
                'SamplePointName': str,
                'SamplePointType': str,
                'AssociatedFacilityID': str,
                'AssociatedSamplePointID': str,
                'CollectionDate': str,
                'SampleID': str,
                'Contaminant': str,
                'MRL': float,
                'MethodID': str,
                'AnalyticalResultsSign': str,
                'AnalyticalResultValue': float,
                'SampleEventCode': str,
                'MonitoringRequirement': str,
                'Region': str,
                'State': str,
            }
        )

        drt = pd.read_table(
            zfile3.open('UCMR3_DRT.txt'),
            encoding='latin1',
            dtype={
                'PWSID': str,
                'FacilityID': str,
                'SamplePointID': str,
                'SampleEventCode': str,
                'CollectionDate': str,
                'Disinfectant Type': str,
            }
        )

        all3 = pd.merge(
            all3, drt,
            how='left',
            on=[
                'PWSID',
                'FacilityID',
                'SamplePointID',
                'CollectionDate',
            ]
        )
        del drt

        zipcodes = pd.read_table(
            zfile3.open('UCMR3_ZipCodes.txt'),
            encoding='latin1',
            dtype={
                'PWSID': str,
                'ZIPCODE': str,
            })
        all3 = pd.merge(all3, zipcodes, how='left', on='PWSID')


        all2 = pd.read_table(
            zfile2.open('UCMR2_All_OccurrenceData_Jan12.txt'),
            encoding='latin1',
            dtype={
                'PWSID': str,
                'PWSName': str,
                'Size': str,
                'FacilityID': str,
                'FacilityName': str,
                'FacilityWaterType': str,
                'SamplePointID': str,
                'SamplePointName': str,
                'SamplePointType': str,
                'AssociatedFacilityID': str,
                'AssociatedSamplePointID': str,
                'DisinfectantType': str,
                'CollectionDate': str,
                'SampleID': str,
                'Contaminant': str,
                'MRL': float,
                'MethodID': str,
                'AnalyticalResultsSign': str,
                'AnalyticalResultValue': float,
                'SampleEventCode': str,
                'MonitoringRequirement': str,
                'Region': str,
                'State': str,
            }
        )

        all2 = pd.merge(all2, zipcodes, how='left', on='PWSID')

        all = all3.append(all2, ignore_index=True)
        del all3, all2

        csvfile = d / 'data.csv'
        with csvfile.open('a') as f:
            all.to_csv(f, index=False, float_format='%.2f')

    qprint('epa:ucmr data consolidated\x1b[K.')
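Example #16 combines the two tables with all3.append(all2, ignore_index=True) and binds the result to all, shadowing the built-in. DataFrame.append was removed in pandas 2.0, so on current pandas the concatenation step looks roughly like this (tiny stand-in frames used so the sketch runs on its own):

import pandas as pd

# Minimal stand-ins for the UCMR3/UCMR2 frames built above.
all3 = pd.DataFrame({'PWSID': ['A1'], 'AnalyticalResultValue': [0.50]})
all2 = pd.DataFrame({'PWSID': ['B2'], 'AnalyticalResultValue': [1.25]})

# pd.concat replaces the removed DataFrame.append.
combined = pd.concat([all3, all2], ignore_index=True)
combined.to_csv('data.csv', index=False, float_format='%.2f')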
Example #17
    def _do_pass(self, cr, uid, ids, context=None):
        """
        Open the ZIP file, extract the CSV file it contains, and parse it to import payroll entries
        """
        # Do verifications
        if not context:
            context = {}

        # Verify that no draft payroll entries exists
        line_ids = self.pool.get('hr.payroll.msf').search(cr, uid, [('state', '=', 'draft')])
        if len(line_ids):
            raise osv.except_osv(_('Error'), _('You cannot import payroll entries. Please validate first draft payroll entries!'))

        # Prepare some values
        file_ext_separator = '.'
        file_ext = "csv"
        message = _("Payroll import failed.")
        res = False
        created = 0
        processed = 0

        header_vals = {}

        xyargv = self._get_homere_password(cr, uid, pass_type='payroll')

        filename = ""
        wiz_state = False
        # Browse all given wizard
        for wiz in self.browse(cr, uid, ids):
            if not wiz.file:
                raise osv.except_osv(_('Error'), _('Nothing to import.'))
            if not wiz_state:
                wiz_state = wiz.state

            # Decode file string
            fileobj = NamedTemporaryFile('w+b', delete=False)
            fileobj.write(decodestring(wiz.file))
            # now we determine the file format
            filename = fileobj.name
            fileobj.close()
            try:
                zipobj = zf(filename, 'r')
                filename = wiz.filename or ""
            except:
                raise osv.except_osv(_('Error'), _('Given file is not a zip file!'))
            if zipobj.namelist():
                namelist = zipobj.namelist()
                # Search CSV
                csvfile = None
                for name in namelist:
                    if name.split(file_ext_separator) and name.split(file_ext_separator)[-1] == file_ext:
                        csvfile = name
                if not 'envoi.ini' in namelist:
                    raise osv.except_osv(_('Warning'), _('No envoi.ini file found in given ZIP file!'))
                # Read information from 'envoi.ini' file
                field = False
                try:
                    import ConfigParser
                    Config = ConfigParser.SafeConfigParser()
                    Config.readfp(zipobj.open('envoi.ini', 'r', xyargv))
                    field = Config.get('DEFAUT', 'PAYS')
                except Exception, e:
                    raise osv.except_osv(_('Error'), _('Could not read envoi.ini file in given ZIP file.'))
                if not field:
                    raise osv.except_osv(_('Warning'), _('Field not found in envoi.ini file.'))
                # Read CSV file
                if csvfile:
                    try:
                        reader = csv.reader(zipobj.open(csvfile, 'r', xyargv), delimiter=';', quotechar='"', doublequote=False, escapechar='\\')
                        reader.next()
                    except:
                        fileobj.close()
                        raise osv.except_osv(_('Error'), _('Problem to read given file.'))
                    res = True
                    res_amount = 0.0
                    amount = 0.0
                    error_msg = ""
                    for line in reader:
                        processed += 1
                        update, amount, nb_created, vals, ccy, msg = self.update_payroll_entries(
                            cr, uid, data=line, field=field,
                            date_format=wiz.date_format,
                            wiz_state=wiz.state)
                        res_amount += round(amount, 2)
                        if not update:
                            res = False
                        if created == 0:
                            header_vals = vals
                            header_vals['currency_code'] = ccy
                        created += nb_created

                        if msg:
                            error_msg += "Line " + str(processed) + ": " + msg + " \n"

                    # Check balance
                    res_amount_rounded = round(res_amount, 2)
                    if res_amount_rounded != 0.0:
                        self._uf_side_rounding_line_check_gap(cr, uid,
                            header_vals['currency_id'],
                            header_vals['currency_code'],
                            header_vals['date'],
                            res_amount_rounded,
                            context=context)

                        # adapt difference by writing on payroll rounding line
                        pr_ids = self.pool.get('hr.payroll.msf').search(
                            cr, uid, [
                                ('state', '=', 'draft'),
                                ('name', '=', 'Payroll rounding')
                            ])
                        if not pr_ids:
                            # no SAGA BALANCE rounding line in file
                            # => create one UF side (US-201)
                            if wiz.state == 'simu':
                                self.write(cr, uid, [wiz.id], {
                                    'state': 'proceed',
                                    'msg': UF_SIDE_ROUNDING_LINE['msg_nb'] % (
                                        res_amount_rounded,
                                        header_vals['currency_code'] , )
                                })
                            else:
                                self._uf_side_rounding_line_create(cr, uid, ids,
                                    context=context, header_vals=header_vals,
                                    amount=-1 * res_amount_rounded)
                            #raise osv.except_osv(_('Error'), _('An error occured on balance and no payroll rounding line found.'))
                        else:
                            # Fetch Payroll rounding amount line and update
                            pr = self.pool.get('hr.payroll.msf').browse(cr, uid, pr_ids[0])
                            # To compute new amount, you should:
                            # - take payroll rounding amount
                            # - take the opposite of res_amount (which is the current difference)
                            # - add both
                            new_amount = round(pr.amount, 2) + (-1 * res_amount_rounded)
                            self.pool.get('hr.payroll.msf').write(cr, uid, pr_ids[0], {'amount': round(new_amount, 2),})
                else:
                    raise osv.except_osv(_('Error'), _('Right CSV is not present in this zip file. Please use "File > File sending > Monthly" in Homère.'))
            fileobj.close()
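Example #17 passes xyargv (the password returned by _get_homere_password) as the third argument of zipobj.open, i.e. the pwd parameter. A stand-alone sketch of reading one password-protected CSV member; on Python 3 the password must be bytes, and the member name, encoding and delimiter used here are assumptions:

import csv
import io
from zipfile import ZipFile

def read_protected_csv(zip_path, member, password):
    """Sketch: read a password-protected CSV member from a ZIP archive."""
    with ZipFile(zip_path) as archive:
        with archive.open(member, 'r', pwd=password) as raw:    # pwd must be bytes on Python 3
            text = io.TextIOWrapper(raw, encoding='latin-1', newline='')
            reader = csv.reader(text, delimiter=';', quotechar='"',
                                doublequote=False, escapechar='\\')
            next(reader, None)            # skip the header line, as the example does
            return list(reader)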
Example #18
def extractallzip(rprt):
    for fold in os.listdir(rprt):
        if fnmatch.fnmatch(fold, '*.zip'):
            print('Extracting ' + fold + '...')
            # os.path.join needs separate arguments; 'rprt + fold' only worked when rprt ended with a separator.
            with zf(os.path.join(rprt, fold), 'r') as archive:
                archive.extractall(rprt)
            print(fold + ' extracted !')