def bridgemapper(istm, ostm, strMap, strCols, strFrom, strTo, ostmLog, iSkip):
    """Translate gene identifiers in selected columns of a tab-delimited stream.

    istm     : input file stream (tab-delimited).
    ostm     : output file stream (tab-delimited).
    strMap   : path to the mapping file handed to convertGeneIds.
    strCols  : bracketed, comma-separated column indices, e.g. "[0,1]".
    strFrom  : source identifier type.
    strTo    : target identifier type.
    ostmLog  : optional metadata/log output stream (skipped when falsy).
    iSkip    : number of 1-based header rows to leave unmapped.

    Data rows whose mapped columns end up empty are dropped from the output.
    """
    # "[0, 1]" -> [0, 1]; an empty spec means no columns get mapped.
    strCols = strCols[1:-1]
    aiCols = [int(s) for s in re.split(r'\s*,\s*', strCols)] if strCols else []

    # Blank metadata object; "mapped" is flipped to True on success below.
    pMeta = metadata.open()
    pMeta.set("mapped", False)

    # First pass: buffer the whole table, checksum every row, and collect the
    # set of identifiers that need translating.
    aastrData = []
    setstrIn = set()
    csvr = csv.reader(istm, csv.excel_tab)
    for astrLine in csvr:
        aastrData.append(astrLine)
        pMeta.update_md5sum("\t".join(astrLine))
        # line_num is 1-based, so the iSkip header rows satisfy line_num <= iSkip.
        # (The original tested "<", which also fed the last header row's cells
        # into the mapping request even though header rows are never rewritten.)
        if csvr.line_num <= iSkip:
            continue
        for iCol in aiCols:
            if iCol < len(astrLine):
                setstrIn.add(astrLine[iCol])
            else:
                sys.stderr.write(
                    " +++ ERROR in GeneMapper +++ Number of requested columns to map is larger than the number of columns in the input data file.\n")
    pMeta.store_checksum()

    # Only attempt the conversion when the mapping file exists and is nonempty.
    hashMap = None
    if strMap and os.path.exists(strMap) and os.stat(strMap)[6] > 0:
        hashMap = convertGeneIds(setstrIn, strMap, strFrom, strTo)
    else:
        sys.stderr.write(
            "+++ ERROR in GeneMapper +++ Input file does not exist or is empty. Return empty file. \n")

    if hashMap:
        if any(hashMap.values()):
            pMeta.set("mapped", True)
        # Second pass: rewrite the mapped columns of every data row. An
        # identifier with no translation becomes the empty string, which makes
        # the whole row get dropped by the all() filter when writing below.
        for iRow in range(iSkip, len(aastrData)):
            astrRow = aastrData[iRow]
            for iCol in aiCols:
                if iCol < len(astrRow):
                    # NOTE: the original clobbered the strTo parameter here;
                    # use a fresh local so the target-type argument survives.
                    strMapped = hashMap.get(astrRow[iCol])
                    astrRow[iCol] = strMapped if strMapped else ""
    else:
        sys.stderr.write(
            "+++Error in GeneMapper +++ Empty mapping. Return original file. \n")

    # Write the result, skipping any row that contains an empty cell
    # (i.e. a column whose mapping failed).
    csvw = csv.writer(ostm, csv.excel_tab)
    for astrLine in aastrData:
        if all(astrLine):
            csvw.writerow(astrLine)
    if ostmLog:
        pMeta.save_text(ostmLog)
def _main():
    """Entry point: pass the input through unchanged, or run the mapper.

    Three cases:
      1. identifier types already match, or the file exceeds iMaxLines
         -> copy input to output, log "mapped" = True;
      2. no mapping file given, or it is empty
         -> copy input to output, log "mapped" = False;
      3. otherwise -> optionally sniff the source type, then bridgemapper().
    """
    args = argp.parse_args()
    iLC = sfle.lc(args.istm.name)

    def _passthrough(fMapped):
        # Copy the input verbatim to the output while checksumming each row,
        # then record whether the data should be considered "mapped".
        csvw = csv.writer(args.ostm, csv.excel_tab)
        pMeta = metadata.open()
        for astrLine in csv.reader(args.istm, csv.excel_tab):
            csvw.writerow(astrLine)
            pMeta.update_md5sum("\t".join(astrLine))
        pMeta.store_checksum()
        if args.ostmLog:
            pMeta.set("mapped", fMapped)
            pMeta.save_text(args.ostmLog)

    if (args.strFrom == args.strTo) or (iLC >= args.iMaxLines):
        # If the two gene identifier types are the same, or the line count
        # exceeds iMaxLines, return the input file flagged as mapped.
        # (The original branch re-opened a fresh metadata object for the log,
        # discarding the stored checksum; the log now keeps it, consistent
        # with the no-map-file branch below.)
        _passthrough(True)
    elif not args.strMap or not os.stat(str(args.strMap))[6]:
        # No mapping file specified (or it is empty): pass through, unmapped.
        _passthrough(False)
    else:
        # If the gene-sniffer flag is on, try to guess the best possible
        # source identifier type before mapping.
        if args.fSniffer:
            args.strFrom = gene_sniffer(args.istm, args.strCols)
        bridgemapper(args.istm, args.ostm, args.strMap, args.strCols,
                     args.strFrom, args.strTo, args.ostmLog, args.iSkip)
def makeunique(istm, ostm, strSplit, iCols, iSkip, ostmLog):
    '''
    Splits up compressed elements (e.g. a///b c///d) into their cartesian
    products (e.g. a c, a d, b c, b d); then gets rid of duplicate elements
    in the set of unordered tuples (equivalence under permutation).

    istm     : input stream (tab-delimited).
    ostm     : output stream (tab-delimited).
    strSplit : delimiter marking compressed names (e.g. "///"); when falsy
               the expansion step is skipped entirely.
    iCols    : number of leading name columns; the remainder are values.
    iSkip    : number of header rows copied through untouched.
    ostmLog  : optional metadata/log output stream.
    '''
    aastrMatIn = list(csv.reader(istm, csv.excel_tab))
    aastrHeaders, aastrDataIn = aastrMatIn[:iSkip], aastrMatIn[iSkip:]

    # Open a blank metadata object; "mapped" records whether rows survived.
    pMeta = metadata.open()

    # Step 1: expand any row whose name columns contain the split delimiter
    # into one row per element of the cartesian product of the split names.
    if strSplit:
        aastrSplit = []
        for astrRow in aastrDataIn:
            astrNames, astrVals = astrRow[:iCols], astrRow[iCols:]
            # Choice: keep rows without values.
            if any(astrNames) and any(strName.find(strSplit) != -1
                                      for strName in astrNames):
                aastrNameParts = [[s.strip() for s in strName.split(strSplit)]
                                  for strName in astrNames]
                aastrSplit += [list(tupNames) + astrVals
                               for tupNames in itertools.product(*aastrNameParts)]
            else:
                aastrSplit.append(astrRow)
    else:
        aastrSplit = aastrDataIn

    # Step 2: keep only rows whose name tuple is (a) duplicate-free within the
    # row and (b) not yet seen, ignoring the order of the names (frozenset).
    setNames = set()
    aastrUnique = []
    for astrRow in aastrSplit:
        astrNames = astrRow[:iCols]
        frozNames = frozenset(astrNames)
        if len(frozNames) == len(astrNames) and frozNames not in setNames:
            setNames.add(frozNames)
            aastrUnique.append(astrRow)
    pMeta.set("mapped", any(aastrUnique))

    # Write output: headers followed by the surviving data rows.
    csvw = csv.writer(ostm, csv.excel_tab)
    for astrOut in aastrHeaders + aastrUnique:
        csvw.writerow(astrOut)

    # Save metadata.
    if ostmLog:
        pMeta.save_text(ostmLog)
import arepa import csv import sys import metadata c_strCurated = "curated" if not (2 <= len(sys.argv[1:]) <= 3): raise Exception( "usage: pkl2metadata.py <ID.pkl> <per-exp.txt> [per-cond.txt]") c_fileIDpkl, c_fileExpTable = sys.argv[1:3] c_fileCondTable = sys.argv[3] if len(sys.argv[1:]) > 2 else None hashMeta = metadata.open(open(c_fileIDpkl, "r")) def writeTable(hMeta, astrKeys, outputf, bIter=False): hMeta = {k: hMeta[k] for k in astrKeys} csvw = csv.writer(open(outputf, "w"), csv.excel_tab) if bIter: astrHeader = hMeta.get("sample_name") or hMeta.get("") _astrKeys = [x for x in astrKeys if hMeta.get(x) != astrHeader] csvw.writerow(["sample_name"] + _astrKeys) for iSample, strSample in enumerate(astrHeader): csvw.writerow( [str(x).replace("\n", " ") for x in [ strSample ] + [hMeta.get(s)[iSample] for s in \ _astrKeys]]) else: csvw.writerow(astrKeys) csvw.writerow(
strMethods, strAuthors, strPMIDs, strTaxAs, strTaxBs, strTypes, strDBs, strIDs, strConfs): metadatum(pMetadata.taxid, [strTaxAs, strTaxBs], 1) metadatum(pMetadata.pmid, [strPMIDs], 1) metadatum(pMetadata.type, [strTypes.lower()], 2) metadatum(pMetadata.technique, [strMethods], 2) if len(sys.argv) < 2: raise Exception("Usage: c2metadata.py <id> < <mpidbc>") strTarget = sys.argv[1] strStatus = sys.argv[2] if len(sys.argv[1:]) > 1 else None strManual = sys.argv[3] if len(sys.argv[1:]) > 2 else None pMetadata = metadata.open() cfile.read(sys.stdin, c_iColumns, strTarget, callback, pMetadata) if strStatus: strMapped, strBool = [ x for x in csv.reader(open(strStatus), csv.excel_tab) ][0] fMapped = True if strBool == "True" else False pMetadata.set(strMapped, fMapped) if strManual: for strKey, strVal in csv.reader(open(strManual), csv.excel_tab): pMetadata.set(strKey, strVal) pMetadata.save(sys.stdout)