def CombineFeatureLabel(f_feature, f_label, f_out):
    """Join every feature shard with the label file in parallel.

    f_feature -- base name of the sharded feature csv ('<base>.<i>.csv')
    f_label   -- label csv shared by every shard
    f_out     -- base name for the combined output shards

    Maps _CombineFeatureLabel over (feature_shard, f_label, out_shard)
    triples, one per worker process.
    """
    pool = mp.Pool(com.__n_process)
    ffs = ['%s.%d.csv' % (util.file_basename(f_feature), i)
           for i in range(com.__n_process)]
    fos = ['%s.%d.csv' % (util.file_basename(f_out), i)
           for i in range(com.__n_process)]
    args_list = []
    for i in range(com.__n_process):
        args_list.append((ffs[i], f_label, fos[i]))
    try:
        pool.map(_CombineFeatureLabel, args_list)
    finally:
        # BUG FIX: the pool was never closed, leaking worker processes.
        pool.close()
        pool.join()
def test_file_basename(self):
    """util.file_basename must reject non-string inputs with TypeError and
    strip both the directory part and the extension from a path string."""
    non_strings = (self.ione, self.fone, self.btrue, self.tsimple,
                   self.lsimple)
    for value in non_strings:
        self.assertRaises(TypeError, util.file_basename, value)
    self.assertEqual(util.file_basename("file.txt"), "file")
    self.assertEqual(util.file_basename("/home/user/file2.ext"), "file2")
def TestModelOnData(modelname, fdata, flabel): actual_set = com.GetBuySet(flabel) rec_set = set() f_base = util.file_basename(fdata) re_str = f_base.replace('.',r'\.') + r'\.\d+\.csv$' f_list = util.FilterFile(re_str) #['%s.%d.csv' % (f_base, j) for j in range(com.__n_process)] for f in f_list: r , p, y = _ParTestModelOnData((modelname, f)) rec_set |= r pred_prob = np.concatenate([pred_prob,p]) Y_true = np.concatenate([Y_true, y]) TP = len(rec_set & actual_set) TN = len(rec_set - actual_set) FP = len(actual_set - rec_set) PrintConfuseMatrix(TP, TN, FP) P, R, F1 = GetPRF1(TP, TN, FP) PrintPRF1(P, R, F1) print 'AUC:', roc_auc_score( Y_true.astype(int), pred_prob) return TP, TN, FP, P, R, F1, pred_prob,Y_true
def FilterCSV(fn): ft = '%s.nofilter.csv' % util.file_basename(fn) if not os.path.exists(ft): os.rename(fn, ft) fo = fn fn = ft block_size = 100000 reader = pandas.read_csv(fn, iterator=True, chunksize=block_size) mod = 'w' header = True i = 0 rules = [LastdayRule] for data in reader: FilterDataWithRule(data, rules).to_csv(fo, mode=mod, header=header,index=False) mod = 'a' header=False i = i + len(data) print 'process %d rows.' % i
def FilterCSV(fn):
    # Filter `fn` in place: the unfiltered original is preserved once as
    # '<base>.nofilter.csv', then re-read in 100k-row chunks and the
    # filtered rows are written back to the original path.
    ft = '%s.nofilter.csv' % util.file_basename(fn)
    if not os.path.exists(ft):
        os.rename(fn, ft)  # keep the raw data; only rename on the first run
    fo = fn  # output: the original path
    fn = ft  # input: the preserved raw file
    block_size = 100000
    reader = pandas.read_csv(fn, iterator=True, chunksize=block_size)
    mod = 'w'  # first chunk overwrites, later chunks append
    header = True
    i = 0
    rules = [LastdayRule]
    for data in reader:
        FilterDataWithRule(data, rules).to_csv(fo, mode=mod, header=header,
                                               index=False)
        mod = 'a'
        header = False
        i = i + len(data)
        print 'process %d rows.' % i  # progress: rows processed so far
def CombineFeatureLabel(f_feature, f_label, f_out):
    """Fan the per-shard feature files out to a worker pool and join each
    shard with the label file, writing one combined shard per process.

    f_feature -- base name of the sharded feature csv ('<base>.<i>.csv')
    f_label   -- label csv shared by every shard
    f_out     -- base name for the combined output shards
    """
    pool = mp.Pool(com.__n_process)
    ffs = [
        '%s.%d.csv' % (util.file_basename(f_feature), i)
        for i in range(com.__n_process)
    ]
    fos = [
        '%s.%d.csv' % (util.file_basename(f_out), i)
        for i in range(com.__n_process)
    ]
    args_list = []
    for i in range(com.__n_process):
        args_list.append((ffs[i], f_label, fos[i]))
    try:
        pool.map(_CombineFeatureLabel, args_list)
    finally:
        # BUG FIX: close and join the pool so worker processes are reaped.
        pool.close()
        pool.join()
def CreateFeature(file, mode):
    """Generate the feature csv for one model file.

    file -- model source file; its basename selects the model module and
            its numeric id selects the output file name.
    mode -- 'train', 'submit' or 'test'; picks the output name pattern and
            the last day of data to use. Unknown modes are silently
            ignored (behavior preserved from the original).
    """
    fid = util.file_basename_id(file)
    GenFeature = util.load_model_from_name(util.file_basename(file)).GenFeature
    # One source csv, three (output-pattern, last-day) configurations;
    # replaces the triplicated literal and non-exclusive if chain.
    source_csv = 'tianchi_mobile_recommend_train_user.csv'
    configs = {
        'train': ('feature%d.csv', '2014-12-17'),
        'submit': ('feature_total%d.csv', '2014-12-18'),
        'test': ('feature_test%d.csv', '2014-12-16'),
    }
    if mode in configs:
        out_pattern, day = configs[mode]
        GenFeature(source_csv, out_pattern % fid, lastday=day)
def CreateFeature(file, mode):
    """Build the feature file for the model named by `file`; `mode`
    chooses the output file name and the last day of data to include."""
    fid = util.file_basename_id(file)
    model = util.load_model_from_name(util.file_basename(file))
    GenFeature = model.GenFeature
    src = 'tianchi_mobile_recommend_train_user.csv'
    if mode == 'train':
        GenFeature(src, 'feature%d.csv' % fid, lastday='2014-12-17')
    if mode == 'submit':
        GenFeature(src, 'feature_total%d.csv' % fid, lastday='2014-12-18')
    if mode == 'test':
        GenFeature(src, 'feature_test%d.csv' % fid, lastday='2014-12-16')
# NOTE(review): this chunk begins mid-way through FilterCSV's chunk loop
# (the full definition appears elsewhere in this file); the statements
# below up to the __main__ guard are that loop's body, shown flattened.
FilterDataWithRule(data, rules).to_csv(fo, mode=mod, header=header,index=False)
mod = 'a'
header=False
i = i + len(data)
print 'process %d rows.' % i
if __name__ == '__main__':
    # Pick the merged feature file (and label/data names) for the phase
    # requested on the command line.
    if sys.argv[1]=='train':
        ff = 'feature.merge.csv'
        fl = 'label.csv'
        fd = 'data.csv'
    elif sys.argv[1]=='test':
        ff = 'feature_test.merge.csv'
        fl = 'label_test.csv'
        fd = 'data.test.csv'
    elif sys.argv[1]=='submit':
        # NOTE(review): fl/fd stay unset on this branch -- confirm the
        # submit path never reads them.
        ff = 'feature_total.merge.csv'
    else:
        print __doc__
        sys.exit()
    pool = mp.Pool(com.__n_process)
    # Expand to every numbered shard '<base>.<i>.csv' of ff.
    fs = util.FilterFile(util.file_basename(ff).replace('.',r'\.') + r'\.\d+\.csv')
    #print fs
    pool.map(FilterCSV, fs)
'--number', default=10000, dest='count', help='sample number') parser.add_option('-p', '--prob', type='float', dest='p', help='sample probability') #parser.add_option('-f','--file', dest='fname',help='file name to sample') (options, args) = parser.parse_args() #print options,args #sys.exit() fname = util.file_basename(args[0]) fd = open('%s.sample.csv' % fname, 'wb') writer = csv.writer(fd, delimiter=',') with open('%s.csv' % fname, 'rb') as f: reader = csv.reader(f, delimiter=',') i = 0 for row in reader: if i == 0 or options.p is None or random.random() < options.p: writer.writerow(row) i = i + 1 if options.p is None and i == options.count: break if i % 100000 == 0:
def GetFeature(data): fn = [i for i in data.columns if i not in ['user_id','item_id', 'buy']] data.assign(user_conver_rate=lambda x: x.user_buy_count/x.user_click_count) df = data[fn].apply(lambda x: np.log(x+1), axis=1) return df if __name__ == '__main__': X,Y = GetData() model_file = '%s.model' % util.file_basename(__file__) if not os.path.exists(model_file): lr = LogisticRegression(penalty='l1') lr.fit(X,Y) util.save_obj(lr, model_file) else: lr = util.load_obj(model_file) fn = X.columns.values pred = lr.predict_proba(X)[:,1] for i in range(len(fn)): print fn[i], lr.coef_[0][i] print f1_score(Y,pred>.5), roc_auc_score(X,pred)
# NOTE(review): this chunk opens mid-way through a DataFrame.to_csv(...)
# call inside FilterCSV's chunk loop; the stray 'index=False)' below is
# that call's tail, preserved verbatim from the collapsed source.
index=False)
mod = 'a'
header = False
i = i + len(data)
print 'process %d rows.' % i
if __name__ == '__main__':
    # Select the merged feature file (and label/data names) per phase.
    if sys.argv[1] == 'train':
        ff = 'feature.merge.csv'
        fl = 'label.csv'
        fd = 'data.csv'
    elif sys.argv[1] == 'test':
        ff = 'feature_test.merge.csv'
        fl = 'label_test.csv'
        fd = 'data.test.csv'
    elif sys.argv[1] == 'submit':
        # NOTE(review): fl/fd are not assigned here -- verify the submit
        # path never reads them.
        ff = 'feature_total.merge.csv'
    else:
        print __doc__
        sys.exit()
    pool = mp.Pool(com.__n_process)
    # Match every numbered shard '<base>.<i>.csv' of ff.
    fs = util.FilterFile(
        util.file_basename(ff).replace('.', r'\.') + r'\.\d+\.csv')
    #print fs
    pool.map(FilterCSV, fs)
# coding: utf-8 ''' take subset of data usage: subset.py filename 1,2,3,4 ''' import csv, sys, util if len(sys.argv)!=3: print __doc__ else: cols = [int(i) for i in sys.argv[2].split(',')] f = open(sys.argv[1], 'rb') fr = csv.reader(f, delimiter=',') fd = open('%s.subset_%s.csv' % (util.file_basename(sys.argv[1]), '_'.join([str(i) for i in cols])), 'wb') fw = csv.writer(fd, delimiter=',') nrows = 0 for row in fr: fw.writerow([row[i] for i in cols]) nrows = nrows + 1 if nrows%100000==0: print 'processed %d rows!' % nrows f.close() fd.close()
print 'usage python merge_fast.py [f1 ... fn fo]' sys.exit() f = [] fr = [] fns = len(sys.argv)-2 for i in range(1,fns+1): fd = open(sys.argv[i],'rb') f.append(fd) reader = csv.reader(fd, delimiter=',') fr.append(reader) fo = sys.argv[-1] fo_base = util.file_basename(fo) fo_list = [open('%s.%d.csv' % (fo_base, j),'wb' ) for j in range(com.__n_process)] fw_list = [csv.writer(fo, delimiter=',') for fo in fo_list] fidx = 0 header = fr[0].next() for i in range(1,fns): header = header + fr[i].next()[2:] map(lambda fw: fw.writerow(header), fw_list) # write header nrows = 0 for row in fr[0]: for i in range(1,fns): newdata = fr[i].next() if (row[0]!=newdata[0] or row[1]!=newdata[1]):
def main(argv=None):
    """Command-line entry point: parse options, fragment the input
    molecule, and write the fragment input file plus optional
    visualization scripts and configuration file."""
    if argv is None:
        argv = sys.argv[1:]
    # load defaults so we can use them below
    from config import FragItConfig
    cfg = FragItConfig()
    parser = OptionParser(usage=strings.usage,
                          description=strings.description,
                          version=strings.version_str)
    parser.add_option("-o", "--output", dest="outputfile", type=str,
                      default="", metavar="filename")
    # Three option groups: config-file handling, fragmentation parameters,
    # and output/layering controls.
    configuration = OptionGroup(parser, "Configuration")
    general = OptionGroup(parser, "Fragmentation")
    output = OptionGroup(parser, "Output")
    configuration.add_option("--use-config", dest="useconfigfile", type=str,
                             default="", metavar="filename",
                             help="Specify configuration file to use. This will ignore other command line parameters.")
    configuration.add_option("--make-config", dest="makeconfigfile", type=str,
                             default="", metavar="filename",
                             help="Specify a filename to use as a configuration file. Use command line options to modify defaults. It is possible to use this command without specifying an input file to generate a clean configuration file.")
    general.add_option("-m", "--maxfragsize", dest="maxFragmentSize",
                       type=int, default=cfg.getMaximumFragmentSize(),
                       metavar="integer",
                       help="The maximum fragment size allowed [default: %default]")
    general.add_option("-g", "--groupcount", dest="groupcount", type=int,
                       default=cfg.getFragmentGroupCount(), metavar="integer",
                       help="Specify number of consecutive fragments to combine into a single fragment [default: %default]")
    general.add_option("--disable-protection", dest="disable_protection",
                       action="store_true", default=False,
                       help="Specify this flag to disable the use protection patterns.")
    general.add_option("--merge-glycine", action="store_true",
                       dest="merge_glycine", default=False,
                       help="Merge a glycine to the neighbor fragment when fragmenting proteins.")
    general.add_option("--merge-specific", dest="mergespecific", type=int,
                       default=None, metavar="integer",
                       help="Merge a specific fragment into all other fragments and remove it as a singular fragment.")
    general.add_option("--charge-model", dest="charge_model",
                       default=cfg.getChargeModel(),
                       help="Charge model to use [%default]")
    general.add_option("--combine-fragments", dest="combinefragments",
                       type=str, default="", metavar="list of integers",
                       help="Combines several fragments into one.")
    output.add_option("--output-format", dest="format", type=str,
                      default=cfg.getWriter(),
                      help="Output format [%default]")
    output.add_option("--output-boundaries", dest="boundaries", type=str,
                      default="", metavar="list of floats",
                      help="Specifies boundaries for multiple layers. Must be used with --central-fragment option")
    output.add_option("--output-central-fragment", dest="central_fragment",
                      type=int, default=cfg.getCentralFragmentID(),
                      metavar="integer",
                      help="Specifies the fragment to use as the central one. Used in combination with --output-boundaries to make layered inputs")
    output.add_option("--output-active-distance", dest="active_atoms_distance",
                      type=float, default=cfg.getActiveAtomsDistance(),
                      metavar="float",
                      help="Atoms within this distance from --output-central-fragment will be active. Use with --output-buffer-distance to add buffer region between active and frozen parts. [default: %default]")
    # NOTE(review): the next help string spanned a line break in the
    # collapsed source; joined with a space -- verify no embedded newline
    # was intended.
    output.add_option("--output-buffer-distance", dest="maximum_buffer_distance",
                      type=float, default=cfg.getBufferDistance(),
                      metavar="float",
                      help="Maximum distance in angstrom from active fragments from which to include nearby fragments as buffers. This option adds and extends to --output-boundaries. [default: %default]")
    output.add_option("--output-freeze-backbone", dest="freeze_backbone",
                      action="store_true", default=cfg.getFreezeBackbone(),
                      help="Option to freeze the backbone of the active region.")
    output.add_option("--output-jmol-script", dest="output_jmol_script",
                      action="store_true", default=cfg.getWriteJmolScript(),
                      help="Write a complimentary jmol script for visualization.")
    output.add_option("--output-pymol-script", dest="output_pymol_script",
                      action="store_true", default=cfg.getWritePymolScript(),
                      help="Write a complimentary pymol script for visualization.")
    parser.add_option_group(configuration)
    parser.add_option_group(general)
    parser.add_option_group(output)
    (options, args) = parser.parse_args(argv)
    # With no input file, --make-config alone writes a default config file.
    if len(args) == 0 and len(options.makeconfigfile) > 0:
        cfg.writeConfigurationToFile(options.makeconfigfile)
        sys.exit()
    if len(args) != 1:
        parser.print_help()
        sys.exit()
    infile = args[0]
    molecule = fileToMol(infile)
    fragmentation = Fragmentation(molecule)
    # if there is a config file, read it and ignore other command line options
    if len(options.useconfigfile) > 0:
        fragmentation.readConfigurationFromFile(options.useconfigfile)
        (writer, output_extension) = get_writer_and_extension(fragmentation.getOutputFormat())
    else:
        fragmentation.setChargeModel(options.charge_model)
        fragmentation.setMaximumFragmentSize(options.maxFragmentSize)
        fragmentation.setOutputFormat(options.format)
        if options.groupcount > 1:
            fragmentation.setFragmentGroupCount(options.groupcount)
        (writer, output_extension) = get_writer_and_extension(options.format)
    # Default output name: input basename + format extension, unless -o given.
    outfile = "%s%s" % (file_basename(infile), output_extension)
    if len(options.outputfile) > 0:
        outfile = options.outputfile
    # do the fragmentation procedure
    # 'fragmentation.doFragmentSpecificMerging()' should go somewhere here...
    fragmentation.setCombineFragments(options.combinefragments)
    if options.disable_protection:
        fragmentation.clearProtectPatterns()
    if options.merge_glycine:
        fragmentation.enableMergeGlycinePattern()
    fragmentation.beginFragmentation()
    fragmentation.doFragmentation()
    fragmentation.doFragmentMerging()
    fragmentation.doFragmentCombination()
    if fragmentation.getFragmentGroupCount() > 1:
        fragmentation.doFragmentGrouping()
    fragmentation.finishFragmentation()
    # write to file
    out = writer(fragmentation)
    # set options from command line
    boundaries = options.boundaries
    central_fragment = options.central_fragment
    active_atoms_distance = options.active_atoms_distance
    maximum_buffer_distance = options.maximum_buffer_distance
    freeze_backbone = options.freeze_backbone
    output_pymol_script = options.output_pymol_script
    output_jmol_script = options.output_jmol_script
    # set options from config file (config values override the command line)
    if len(options.useconfigfile) > 0:
        boundaries = fragmentation.getBoundaries()
        central_fragment = fragmentation.getCentralFragmentID()
        output_pymol_script = fragmentation.getWritePymolScript()
        output_jmol_script = fragmentation.getWriteJmolScript()
        freeze_backbone = fragmentation.getFreezeBackbone()
        maximum_buffer_distance = fragmentation.getBufferDistance()
        active_atoms_distance = fragmentation.getActiveAtomsDistance()
    # set the options
    out.setBoundariesFromString(boundaries)
    out.setCentralFragmentID(central_fragment)
    out.setActiveAtomsDistance(active_atoms_distance)
    out.setBufferMaxDistance(maximum_buffer_distance)
    if freeze_backbone:
        out.setFreezeBackbone()
    if output_pymol_script:
        out.setPymolOutput(infile,outfile)
    if output_jmol_script:
        out.setJmolOutput(infile,outfile)
    out.setup()
    out.writeFile(outfile)
    # write configuration file
    if len(options.makeconfigfile) > 0:
        fragmentation.setBoundaries(boundaries)
        fragmentation.writeConfigurationToFile(options.makeconfigfile)
def main(argv=None):
    """CLI driver: build the option parser, fragment the input molecule,
    and emit the fragment input file (optionally plus jmol/pymol scripts
    and a configuration file)."""
    if argv is None:
        argv = sys.argv[1:]
    # load defaults so we can use them below
    from config import FragItConfig
    cfg = FragItConfig()
    parser = OptionParser(usage=strings.usage,
                          description=strings.description,
                          version=strings.version_str)
    parser.add_option("-o", "--output", dest="outputfile", type=str,
                      default="", metavar="filename")
    # Option groups: configuration-file handling, fragmentation knobs,
    # and output/layering controls.
    configuration = OptionGroup(parser, "Configuration")
    general = OptionGroup(parser, "Fragmentation")
    output = OptionGroup(parser, "Output")
    configuration.add_option(
        "--use-config", dest="useconfigfile", type=str, default="",
        metavar="filename",
        help="Specify configuration file to use. This will ignore other command line parameters.")
    configuration.add_option(
        "--make-config", dest="makeconfigfile", type=str, default="",
        metavar="filename",
        help="Specify a filename to use as a configuration file. Use command line options to modify defaults. It is possible to use this command without specifying an input file to generate a clean configuration file.")
    general.add_option(
        "-m", "--maxfragsize", dest="maxFragmentSize", type=int,
        default=cfg.getMaximumFragmentSize(), metavar="integer",
        help="The maximum fragment size allowed [default: %default]")
    general.add_option(
        "-g", "--groupcount", dest="groupcount", type=int,
        default=cfg.getFragmentGroupCount(), metavar="integer",
        help="Specify number of consecutive fragments to combine into a single fragment [default: %default]")
    general.add_option(
        "--disable-protection", dest="disable_protection",
        action="store_true", default=False,
        help="Specify this flag to disable the use protection patterns.")
    general.add_option(
        "--merge-glycine", action="store_true", dest="merge_glycine",
        default=False,
        help="Merge a glycine to the neighbor fragment when fragmenting proteins.")
    general.add_option(
        "--merge-specific", dest="mergespecific", type=int, default=None,
        metavar="integer",
        help="Merge a specific fragment into all other fragments and remove it as a singular fragment.")
    general.add_option("--charge-model", dest="charge_model",
                       default=cfg.getChargeModel(),
                       help="Charge model to use [%default]")
    general.add_option("--combine-fragments", dest="combinefragments",
                       type=str, default="", metavar="list of integers",
                       help="Combines several fragments into one.")
    output.add_option("--output-format", dest="format", type=str,
                      default=cfg.getWriter(),
                      help="Output format [%default]")
    output.add_option(
        "--output-boundaries", dest="boundaries", type=str, default="",
        metavar="list of floats",
        help="Specifies boundaries for multiple layers. Must be used with --central-fragment option")
    output.add_option(
        "--output-central-fragment", dest="central_fragment", type=int,
        default=cfg.getCentralFragmentID(), metavar="integer",
        help="Specifies the fragment to use as the central one. Used in combination with --output-boundaries to make layered inputs")
    output.add_option(
        "--output-active-distance", dest="active_atoms_distance", type=float,
        default=cfg.getActiveAtomsDistance(), metavar="float",
        help="Atoms within this distance from --output-central-fragment will be active. Use with --output-buffer-distance to add buffer region between active and frozen parts. [default: %default]")
    # NOTE(review): the following help string spanned a line break in the
    # collapsed source; joined with a space -- confirm no embedded newline
    # was intended.
    output.add_option(
        "--output-buffer-distance", dest="maximum_buffer_distance",
        type=float, default=cfg.getBufferDistance(), metavar="float",
        help="Maximum distance in angstrom from active fragments from which to include nearby fragments as buffers. This option adds and extends to --output-boundaries. [default: %default]")
    output.add_option(
        "--output-freeze-backbone", dest="freeze_backbone",
        action="store_true", default=cfg.getFreezeBackbone(),
        help="Option to freeze the backbone of the active region.")
    output.add_option(
        "--output-jmol-script", dest="output_jmol_script",
        action="store_true", default=cfg.getWriteJmolScript(),
        help="Write a complimentary jmol script for visualization.")
    output.add_option(
        "--output-pymol-script", dest="output_pymol_script",
        action="store_true", default=cfg.getWritePymolScript(),
        help="Write a complimentary pymol script for visualization.")
    parser.add_option_group(configuration)
    parser.add_option_group(general)
    parser.add_option_group(output)
    (options, args) = parser.parse_args(argv)
    # --make-config with no input file just writes the defaults and exits.
    if len(args) == 0 and len(options.makeconfigfile) > 0:
        cfg.writeConfigurationToFile(options.makeconfigfile)
        sys.exit()
    if len(args) != 1:
        parser.print_help()
        sys.exit()
    infile = args[0]
    molecule = fileToMol(infile)
    fragmentation = Fragmentation(molecule)
    # if there is a config file, read it and ignore other command line options
    if len(options.useconfigfile) > 0:
        fragmentation.readConfigurationFromFile(options.useconfigfile)
        (writer, output_extension) = get_writer_and_extension(
            fragmentation.getOutputFormat())
    else:
        fragmentation.setChargeModel(options.charge_model)
        fragmentation.setMaximumFragmentSize(options.maxFragmentSize)
        fragmentation.setOutputFormat(options.format)
        if options.groupcount > 1:
            fragmentation.setFragmentGroupCount(options.groupcount)
        (writer, output_extension) = get_writer_and_extension(options.format)
    # Default output name: input basename + format extension, unless -o given.
    outfile = "%s%s" % (file_basename(infile), output_extension)
    if len(options.outputfile) > 0:
        outfile = options.outputfile
    # do the fragmentation procedure
    # 'fragmentation.doFragmentSpecificMerging()' should go somewhere here...
    fragmentation.setCombineFragments(options.combinefragments)
    if options.disable_protection:
        fragmentation.clearProtectPatterns()
    if options.merge_glycine:
        fragmentation.enableMergeGlycinePattern()
    fragmentation.beginFragmentation()
    fragmentation.doFragmentation()
    fragmentation.doFragmentMerging()
    fragmentation.doFragmentCombination()
    if fragmentation.getFragmentGroupCount() > 1:
        fragmentation.doFragmentGrouping()
    fragmentation.finishFragmentation()
    # write to file
    out = writer(fragmentation)
    # set options from command line
    boundaries = options.boundaries
    central_fragment = options.central_fragment
    active_atoms_distance = options.active_atoms_distance
    maximum_buffer_distance = options.maximum_buffer_distance
    freeze_backbone = options.freeze_backbone
    output_pymol_script = options.output_pymol_script
    output_jmol_script = options.output_jmol_script
    # set options from config file (config values take precedence)
    if len(options.useconfigfile) > 0:
        boundaries = fragmentation.getBoundaries()
        central_fragment = fragmentation.getCentralFragmentID()
        output_pymol_script = fragmentation.getWritePymolScript()
        output_jmol_script = fragmentation.getWriteJmolScript()
        freeze_backbone = fragmentation.getFreezeBackbone()
        maximum_buffer_distance = fragmentation.getBufferDistance()
        active_atoms_distance = fragmentation.getActiveAtomsDistance()
    # set the options
    out.setBoundariesFromString(boundaries)
    out.setCentralFragmentID(central_fragment)
    out.setActiveAtomsDistance(active_atoms_distance)
    out.setBufferMaxDistance(maximum_buffer_distance)
    if freeze_backbone:
        out.setFreezeBackbone()
    if output_pymol_script:
        out.setPymolOutput(infile, outfile)
    if output_jmol_script:
        out.setJmolOutput(infile, outfile)
    out.setup()
    out.writeFile(outfile)
    # write configuration file
    if len(options.makeconfigfile) > 0:
        fragmentation.setBoundaries(boundaries)
        fragmentation.writeConfigurationToFile(options.makeconfigfile)
import csv, sys, util, random from optparse import OptionParser parser = OptionParser() parser.add_option('-n','--number', default=10000, dest='count',help='sample number') parser.add_option('-p','--prob', type='float', dest='p',help='sample probability') #parser.add_option('-f','--file', dest='fname',help='file name to sample') (options, args) = parser.parse_args() #print options,args #sys.exit() fname = util.file_basename(args[0]) fd = open('%s.sample.csv' % fname,'wb') writer = csv.writer(fd, delimiter=',') with open('%s.csv' % fname, 'rb') as f: reader = csv.reader(f, delimiter=',') i = 0 for row in reader: if i==0 or options.p is None or random.random()<options.p: writer.writerow(row) i = i+1 if options.p is None and i==options.count: