###############################################
# Resolve the two input file names; when a prefix was supplied it is
# prepended to both of them.
if args.prefix is None:
    raw_fetch_fname, gb_fname = args.raw_fetch, args.genbank
else:
    raw_fetch_fname, gb_fname = (
        os.path.join(args.prefix, args.raw_fetch),
        os.path.join(args.prefix, args.genbank),
    )

# Common directory of both inputs, kept for later use.
# os.path.commonprefix compares character by character, so the result may end
# in the middle of a path component; os.path.dirname trims that partial tail.
common_path = os.path.dirname(
    os.path.commonprefix([raw_fetch_fname, gb_fname])
)

# Don't forget to provide your email: a placeholder address is used when
# none was given on the command line.
Entrez.email = args.email or "your_email@mail_server.com"

crit_threshold = args.threshold

# Read the GenBank file through the project helper ...
gbrecs = ms.genebank_fix_n_read(gb_fname)
######################################
# ... and store the records on the ms module as well
# (presumably read by helpers inside ms -- confirm against that module).
ms.gbrecs = gbrecs

############################
# Read the file that associates GeneName (and/or locus) with FetchID.
print("Reading %s with the updated spectrum that includes fetchid column ..." % raw_fetch_fname)
raw_fetch = pd.read_csv(raw_fetch_fname)

# Here is the NEW plan:
#   first, we try to assign a single protein to each peptide;
#   peptide-protein pairs that fail to match are collected, declared BAD
#   and sent to manual processing ...
#####################################################################################################
from StringIO import StringIO
import os
import warnings
from Bio import BiopythonWarning, BiopythonParserWarning
import subprocess as sub

# Directory containing the pulled files that are read below.
dest = "../PULLED_PROTEINS_TOTAL"

# Get the paths of all the pulled files in the destination directory.
# os.listdir is used instead of parsing the output of an `ls` subprocess:
#   * an empty directory now gives an empty list (splitting the empty `ls`
#     output produced one bogus entry -- the directory path itself);
#   * file names are never split on embedded newlines;
#   * no dependence on an external `ls` binary or on str/bytes output.
# Dot-files are skipped and names are sorted to keep the listing `ls` gave.
# A missing directory now raises OSError rather than CalledProcessError.
pulled_files = [
    os.path.join(dest, fname)
    for fname in sorted(os.listdir(dest))
    if not fname.startswith('.')
]

# Read every pulled file twice through the project helper, once per key kind
# (the second argument, 'id' or 'gi', presumably selects how records are
# keyed -- confirm in ms.genebank_fix_n_read).
gbrecs_id = [ms.genebank_fix_n_read(fn, 'id') for fn in pulled_files]
gbrecs_gi = [ms.genebank_fix_n_read(fn, 'gi') for fn in pulled_files]
# # # ######################################
# # # # assign some module internal stuff ...
# # # ms.gbrecs = gbrecs

# Perform some simple sanity tests: gather the keys of every record group.
# list(...) guarantees a 1-D array even if .keys() returns a view object.
id_keys = [np.asarray(list(gg.keys())) for gg in gbrecs_id]
gi_keys = [np.asarray(list(gg.keys())) for gg in gbrecs_gi]

#####################################################################################
print("Proteins in each of id groups are unique, True or False:")
# A group is unique when dropping duplicate keys does not shrink it.
answ1 = np.asarray([f.size == np.unique(f).size for f in id_keys]).all()
print(answ1)
#####################################################################################
print("Proteins in each of gi groups are unique, True or False:")