def get_pdf(assembly_id, range=(None, None), bill_ids=None):
    if bill_ids is not None and not bill_ids:
        return
    datadir = '%s/%s' % (DIR['data'], assembly_id)
    pdfdir = '%s/%s' % (DIR['pdf'], assembly_id)
    txtdir = '%s/%s' % (DIR['txt'], assembly_id)
    utils.check_dir(pdfdir)
    utils.check_dir(txtdir)

    failed = []
    jsons = os.listdir(datadir)[range[0]:range[1]]
    for json in jsons:
        if bill_ids and json.split('.', 1)[0] not in bill_ids:
            continue
        print json
        try:
            download(assembly_id, json, datadir, pdfdir)
            pdffile = '%s/%s' % (pdfdir, json.replace('json', 'pdf'))
            txtfile = '%s/%s' % (txtdir, json.replace('json', 'txt'))
            # TODO: apply celery
            try:
                pdf2txt(pdffile, txtfile)
            except (PSEOF, PDFSyntaxError) as e:
                print 'Failed parsing %s with %s' % (json, e)
                failed.append((json, e))
            except IOError as e:
                print 'File %s does not exist (%s)' % (json, e)
                failed.append((json, e))
        except (IndexError, TypeError) as e:
            print 'Failed downloading %s with %s' % (json, e)
            failed.append((json, e))
    print 'Failed files: ', failed
def get_html(assembly_id, npages):

    def get_page(baseurl, page, directory, npages):
        try:
            url = baseurl + '&PAGE=%d&PAGE_SIZE=%d' % (page, PAGE_SIZE)
            pn = npages - page + 1
            fn = '%s/%d.html' % (directory, pn)
            is_first = True
            while is_first or 'TEXTAREA ID="MSG" STYLE="display:none"' in doc:
                doc = utils.get_webpage_text(url)
                is_first = False
            with open(fn, 'w') as f:
                f.write(doc)
            sys.stdout.write('%s\t' % pn)
            sys.stdout.flush()
        except (requests.exceptions.RequestException, IOError) as e:
            print '\nFailed to get %s due to %s' % (fn, repr(e))

    baseurl, directory = convert(assembly_id)
    utils.check_dir(directory)
    # print 'Downloading:'
    jobs = [gevent.spawn(get_page, baseurl, page, directory, npages)
            for page in range(1, npages + 1)]
    gevent.joinall(jobs)
    return npages
def main():
    # Get arguments
    args = parse_args()
    filein, join_path, io_type = args.filename, args.join, args.type

    # Parse the document
    with open(filein) as fin:
        text = fin.read()
    examples = DocParser(text).parse()

    # Get files to join
    if join_path:
        targets = []
        filenames = get_joining_filenames(join_path, len(examples), io_type)
        for filename in filenames:
            with open(os.path.join(join_path, filename)) as fin:
                try:
                    target = json.load(fin, object_pairs_hook=OrderedDict)
                except json.JSONDecodeError:
                    raise ValueError('{} has invalid format'.format(filename)) from None
            targets.append(target)
    else:
        targets = (OrderedDict() for _ in iter(int, 1))

    # Turn data into output dicts
    dicts_in = dicts_out = [OrderedDict() for _ in range(len(examples))]
    extend_dicts(examples, dicts_in, dicts_out)

    # Check output directory
    check_dir(OUT_DIR)

    # Write target dicts to files
    export_examples(dicts_in, io_type, 'json')
def main():
    args = argparser.parse_args()

    emf_file = args.emf
    out_root = args.outroot
    # img_root = args.imgroot
    # tree_root = args.treeroot
    clades_pickle = args.species_cache
    prefix = args.prefix

    all_species = utils.ens_get("/info/species/")["species"]
    all_species_names = [it["name"].replace("_", " ") for it in all_species]
    all_species_names.remove("Ancestral sequences")

    if path.exists(clades_pickle):
        Clades = pickle.load(open(clades_pickle, 'rb'))
    else:
        Clades = filter_clades(all_species_names,
                               ["Eutheria", "Glires", "Laurasiatheria",
                                "Sauria", "Mammalia", "Primates"])
        pickle.dump(Clades, open(clades_pickle, 'wb'))
    pprint(Clades)

    TL = TCCList()
    TL.add(TCC(Clades[args.clade], operator.ge, args.thr))

    utils.check_dir(path.join(out_root, args.clade))

    tree_id = 1
    for tree in emf.EMF(emf_file):
        print tree_id
        # treedir = path.join(tree_root, str(tree_id)[:2])
        # utils.check_dir(treedir)
        # tree.write(outfile=path.join(treedir, "{}.nh".format(tree_id)))
        seqsets, subtrees = split_tree(tree, TL, prefix)
        outdir = path.join(out_root, args.clade, str(tree_id)[:2])
        utils.check_dir(outdir)

        # Treevis
        # layout = make_layout(seqsets)
        # imgdir = path.join(img_root, args.clade)
        # utils.check_dir(imgdir)
        # imgfile = path.join(imgdir, "{}.pdf".format(tree_id))
        # tree.render(imgfile, layout=layout)

        set_id = 1
        for seqset, subtree in zip(seqsets, subtrees):
            outfile = open(path.join(outdir, "{0}_{1}.tab".format(tree_id, set_id)), 'w')
            for seqid in seqset:
                print >>outfile, '\t'.join(seqid)
            subtree.write(outfile=path.join(outdir, "{0}_{1}.nh".format(tree_id, set_id)),
                          format=6)
            set_id += 1
        tree_id += 1
def __write_fastqc(self, o, pobj, r): # Dict for reporter incurr = { 'leftseq': {'data': None,'summary': None}, 'rightseq': {'data': None,'summary': None}, 'paired': r.paired, 'readgroup': r.readgroup } ocurr = { 'leftseq': None, 'rightseq': None, 'paired': r.paired, 'readgroup': r.readgroup } # Set files fqdir = os.path.join(pobj.files['results_path'], \ '{0.name}/00_Preprocessing/fastqc/{1.readgroup}'.format(self, r)) o.write('# Fastqc\n') o.write('fastqcResultsDir = {0}\n'.format(fqdir)) raw1 = r.leftseq out1 = [i for i in [re.search('([\w\_\-\.\d]+)\.txt\.gz$', raw1), re.search('([\w\_\-\.\d]+)\.txt$', raw1), re.search('([\w\_\-\.\d]+)\.fastq\.gz$', raw1), re.search('([\w\_\-\.\d]+)\.fastq$', raw1)] if i][0].group(1) o.write('leftFastqcResultsDir = {0}_fastqc\n'.format(out1)) incurr['leftseq']['data'] = os.path.join(fqdir, '{0}_fastqc/fastqc_data.txt'.format(out1)) incurr['leftseq']['summary'] = os.path.join(fqdir, '{0}_fastqc/summary.txt'.format(out1)) ocurr['leftseq'] = os.path.join(pobj.files['report_path'], \ 'project_reads/{0.name}/{1.readgroup}/leftseq'.format(self, r)) utils.check_dir(ocurr['leftseq']) self.reporter_files.append(os.path.join(ocurr['leftseq'], 'fastqc_data.txt')) self.reporter_files.append(os.path.join(ocurr['leftseq'], 'fastqc_gcbd.txt')) self.reporter_files.append(os.path.join(ocurr['leftseq'], 'fastqc_pbnc.txt')) self.reporter_files.append(os.path.join(ocurr['leftseq'], 'fastqc_qbd.txt')) if r.paired: raw2 = r.rightseq out2 = [i for i in [re.search('([\w\_\-\.\d]+)\.txt\.gz$', raw2), re.search('([\w\_\-\.\d]+)\.txt$', raw2), re.search('([\w\_\-\.\d]+)\.fastq\.gz$', raw2), re.search('([\w\_\-\.\d]+)\.fastq$', raw2)] if i][0].group(1) o.write('rightFastqcResultsDir = {0}_fastqc\n'.format(out2)) incurr['rightseq']['data'] = os.path.join(fqdir, '{0}_fastqc/fastqc_data.txt'.format(out2)) incurr['rightseq']['summary'] = os.path.join(fqdir, '{0}_fastqc/fastqc_summary.txt'.format(out2)) ocurr['rightseq'] = os.path.join(pobj.files['report_path'], \ 'project_reads/{0.name}/{1.readgroup}/rightseq'.format(self, r)) utils.check_dir(ocurr['rightseq']) self.reporter_files.append(os.path.join(ocurr['rightseq'], 'fastqc_data.txt')) self.reporter_files.append(os.path.join(ocurr['rightseq'], 'fastqc_gcbd.txt')) self.reporter_files.append(os.path.join(ocurr['rightseq'], 'fastqc_pbnc.txt')) self.reporter_files.append(os.path.join(ocurr['rightseq'], 'fastqc_qbd.txt')) o.write('\n') # Add to dict self.reporter_obj['reads']['inputs'][r.phenotype.lower()].append(incurr) self.reporter_obj['reads']['outputs'][r.phenotype.lower()].append(ocurr)
def rrd_graph_sum(ldir, loc, width=1080, height=384):
    if len(loc['hosts']) == 0:
        logging.warn("location `%s': no host(s) found" % (loc['name'],))
        return
    lid = osp.join(ldir, loc['id'])
    check_dir(0755, lid)
    rrds.rrd_graph_sum_net(lid, loc['name'], loc['hosts'], width, height)
    rrds.rrd_graph_sum_udp(lid, loc['name'], loc['hosts'], width, height)
    rrds.rrd_graph_sum_cpu(lid, loc['name'], loc['hosts'], width, height)
    rrds.rrd_graph_sum_mem(lid, loc['name'], loc['hosts'], width, height)
    return
def output(self):
    job = Job(job_id=self.jobid)
    # print(job.path_map.dir_dict)
    self.output_file = os.path.join(job.path_map.tmp_dir, 'dir_dict.yaml')
    for path in job.path_map.dir_dict.itervalues():
        utils.check_dir(path=path)
        # print(path)
    yaml.dump(data=job.path_map.dir_dict, stream=open(self.output_file, 'w'))
    return luigi.LocalTarget(path=self.output_file)
def checkpoint(self):
    self._update_request()
    occupation = numpy.array(
        [len(self.wally[self.wally[i, j, :, 0] != 0])
         for i in xrange(self.size) for j in xrange(self.size)]
    ).reshape(self.size, self.size)
    utils.check_dir('wally')
    utils.pyplot_from_array(str(self.tick), occupation, self.capacity)
    self.conn.root['conf']['tick'] = self.tick
    self.conn.root['wally'] = self.wally
    self.conn.root['rookies'] = self.rookies
    self.conn.commit()
def get_npages(assembly_id):
    url, directory = convert(assembly_id)
    utils.check_dir(directory)
    fn = '%s/tmp.html' % directory
    utils.get_webpage(url, fn)
    page = utils.read_webpage(fn)
    m = re.search(u'총(.+)건', page.xpath('//span[@class="text3"]/text()')[0])
    nbills = int(m.group(1))
    npages = int(math.ceil(nbills / float(PAGE_SIZE)))
    print 'Total %d bills, %d pages to %s' % (nbills, npages, directory)
    return npages
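A quick sanity check on the page arithmetic above, assuming a PAGE_SIZE of 50: a listing page reporting 총 1234건 (1234 bills in total) gives npages = ceil(1234 / 50) = 25 pages to fetch.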
def search(self, data, classes):
    """ Searches for the patterns based on expression data

    Input
    -----
    data : numpy array[n_samples, n_genes], GE data
    classes : numpy zero-one array[n_samples]
    """
    self.patterns = []
    c = 0
    time_d = 0
    for seed in self.seeds:
        # print seed
        c += 1
        if self.verbose and c % 100 == 0:
            print "Searching with seed %s" % str(seed)
            print np.mean(time_d)
            time_d = 0
        pattern = self.search_method.create_pattern(data, seed)
        pattern.evaluate(data, self.metric, classes)
        st = time.clock()
        while True:
            next_pattern = max(
                pattern.expand(self.network, self.radius),
                key=lambda ex: ex.evaluate(data, self.metric, classes)
            )
            if (next_pattern.score / pattern.score) > 1 + self.min_improve:
                pattern = next_pattern
                # print "improvement", pattern.score
            else:
                break
        # pattern.edges = filter_edges(pattern.edges, pattern.genes)
        time_d += time.clock() - st
        if self.trivial_patterns or len(list(seed)[0]) > 2:
            self.patterns += [pattern]
        check_dir(self.base_dir + "greedy_search_pics/")
        if self.draw:
            gene_color = dict()
            for gene in pattern.genes:
                edges_names = set((self.gene_names[h1], self.gene_names[h2])
                                  for (h1, h2) in pattern.edges)
                # a function to color a gene in discovered pattern
                gene_color[self.gene_names[gene]] = scipy.stats.ttest_ind(
                    data[:, -1], data[:, gene])
            print "Drawing a graph for seed %s" % str(seed)
            draw_graph(edges_names,
                       self.base_dir + "greedy_search_pics/test-graph-greedy",
                       seed)
        # if seed > 550:
        #     break
    return self.patterns
def html2json(assembly_id, range=(None, None), bill_ids=None):
    if bill_ids is not None and not bill_ids:
        return
    metafile = '%s/%d.csv' % (DIR['meta'], assembly_id)
    print metafile
    meta = pd.read_csv(metafile, dtype={'bill_id': object, 'link_id': object})
    jsondir = '%s/%s' % (DIR['data'], assembly_id)
    utils.check_dir(jsondir)

    if not bill_ids:
        bill_ids = meta['bill_id'][range[0]:range[1]]

    jobs = [gevent.spawn(parse_page, assembly_id, bill_id, meta, jsondir)
            for bill_id in bill_ids]
    gevent.joinall(jobs)
def main():
    # Get arguments
    args = parse_args()

    # Parse the document
    with open(args.filename) as fin:
        text = fin.read()
    examples = DocParser(text).findall()

    # Check output directory
    check_dir(OUT_DIR)

    # Generate templates
    def gen(gters):
        for gter in gters:
            gter(examples).generate()

    gen([PythonGenerator, JavaGenerator])
def html2csv(assembly_id, npages):

    def list_to_file(l, f):
        f.write('"')
        f.write('","'.join(l).encode("utf-8"))
        f.write('"\n')

    def parse_columns(columns):
        data = []
        for j, c in enumerate(columns):
            if j == 1:
                status = str(int(re.findall(r"[0-9]+", c.xpath("img/@src")[0])[0]))
                title = c.xpath("a/text()")[0].replace('"', "'")
                link = re.findall(r"\w+", c.xpath("a/@href")[0])[2]
                data.extend([status, title, link])
            elif j == 6:
                data.append("1" if c.xpath("img/@onclick") else "0")
            else:
                data.append(c.xpath("text()")[0].strip())
        return data

    def parse_page(page, f, assembly_id):
        fn = "%s/%s/%d.html" % (DIR["list"], assembly_id, page)
        p = utils.read_webpage(fn)
        rows = utils.get_elems(p, X["table"])
        for r in reversed(rows):
            columns = r.xpath(X["columns"])
            if len(columns) == 8:
                p = parse_columns(columns)
                list_to_file(p, f)
        sys.stdout.write("%d\t" % page)
        sys.stdout.flush()

    directory = DIR["meta"]
    utils.check_dir(directory)
    meta_data = "%s/%d.csv" % (directory, assembly_id)

    print "\nParsing:"
    with open(meta_data, "w") as f:
        list_to_file(META_HEADERS, f)
        for page in range(1, npages + 1):
            parse_page(page, f, assembly_id)
    print "\nMeta data written to " + meta_data
def main(args): printer = print_csv if args.test else print_json filetype = 'csv' if args.test else 'json' datadir = args.directory if args.directory else '.' check_dir(datadir) if args.target=='local': if args.end: jobs = [] args.level = get_election_type_name(args.level) for n in xrange(args.start, args.end+1): filename = '%s/%s-%s-%s-%d.%s'\ % (datadir, args.target, args.level, args.type, n, filetype) job = gevent.spawn(crawl, target=args.target, level=args.level,\ _type=args.type, nth=n, filename=filename, printer=printer) jobs.append(job) gevent.joinall(jobs) else: n = args.start args.level = get_election_type_name(args.level) filename = '%s/%s-%s-%s-%.01f.%s' %\ (datadir, args.target, args.level, args.type, n, filetype) crawl(target=args.target, level=args.level, _type=args.type, nth=n,\ filename=filename, printer=printer) else: if args.end: jobs = [] for n in xrange(args.start, args.end+1): filename = '%s/%s-%s-%d.%s'\ % (datadir, args.target, args.type, n, filetype) job = gevent.spawn(crawl, target=args.target, _type=args.type, nth=n,\ filename=filename, printer=printer) jobs.append(job) gevent.joinall(jobs) else: n = args.start filename = '%s/%s-%s-%.01f.%s' %\ (datadir, args.target, args.type, n, filetype) crawl(target=args.target, _type=args.type, nth=n,\ filename=filename, printer=printer) print 'Data written to %s' % filename
def get_pdf(assembly_id, range=(None, None), bill_ids=None):
    if bill_ids is not None and not bill_ids:
        return
    indir = '%s/%s' % (DIR['data'], assembly_id)
    outdir = '%s/%s' % (DIR['pdf'], assembly_id)
    utils.check_dir(outdir)

    failed = []
    jsons = os.listdir(indir)[range[0]:range[1]]
    for json in jsons:
        if bill_ids and json.split('.', 1)[0] not in bill_ids:
            continue
        try:
            download(assembly_id, json, indir, outdir)
        except (IndexError, TypeError) as e:
            print 'Failed downloading %s with %s' % (json, e)
            failed.append((json, e))
    print 'Failed files: ', failed
def get_html(assembly_id, range=(None, None), bill_ids=None):
    if bill_ids is not None and not bill_ids:
        return
    for field in HTML_FIELDS:
        utils.check_dir('%s/%s' % (DIR[field], assembly_id))

    metadata = get_metadata(assembly_id, range=range)
    for bill_id in metadata:
        if bill_id == 'bill_id':
            continue
        if bill_ids and bill_id not in bill_ids:
            continue
        link_id, has_summaries = metadata[bill_id]
        for field in HTML_FIELDS[1:3]:
            get_page(assembly_id, bill_id, link_id, field)
        get_specifics(assembly_id, bill_id, link_id)
        get_summaries(assembly_id, bill_id, link_id, has_summaries)
        sys.stdout.write('%s\t' % bill_id)
        sys.stdout.flush()
def _set_project_files(self):
    """Sets the project path, log, jobs, config, and results paths for the project.

    project_path - parent directory for this project
    config_path - parent directory for all config files within this project
    job_path - parent directory for all job files within this project
    log_path - parent directory for all log files within this project
    results_path - parent directory for all results files within this project
    report_path - parent directory for all report files within this project
    """
    self.files = {
        "project_path": os.path.abspath(self.args.output_directory),
        "config_path": os.path.join(os.path.abspath(self.args.output_directory), "config"),
        "log_path": os.path.join(os.path.abspath(self.args.output_directory), "logs"),
        "results_path": os.path.join(os.path.abspath(self.args.output_directory), "results"),
        "report_path": os.path.join(os.path.abspath(self.args.output_directory), "report"),
    }
    [check_dir(i) for i in self.files.values()]
import os

from utils import check_dir, check_file

# config
IP = '0.0.0.0'
PORT = 6677
DGRAM_FORMAT = '50s50s50s200s'
CMD_FORMAT = '50s50s50s'
BASE_DIR = './demo'
DEBUG = True
PID_FILE = os.path.join(BASE_DIR, 'pychat_server.pid')
LOG_FILE = os.path.join(BASE_DIR, 'pychat_server.log')

# const
USER_PATH = os.path.join(BASE_DIR, 'user')
MSG_PATH = os.path.join(BASE_DIR, 'msg')
FILE_PATH = os.path.join(BASE_DIR, 'file')
history_msg_file = os.path.join(MSG_PATH, 'history.pk')
offline_msg_file = os.path.join(MSG_PATH, 'offline.pk')
user_file = os.path.join(USER_PATH, 'user.pk')
friend_file = os.path.join(USER_PATH, 'friends.pk')
file_info = os.path.join(FILE_PATH, 'file_info.txt')

check_dir(USER_PATH)
check_dir(MSG_PATH)
check_dir(FILE_PATH)
check_file(file_info)
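The check_dir and check_file helpers used throughout these snippets come from each project's local utils module, which is not shown here. A minimal sketch of what they are assumed to do (create the directory, or touch the file, if it is missing) could look like the following; the real helpers likely differ per project (for example, one snippet above calls check_dir(0755, lid) with a mode argument first).

# Hypothetical sketch of the assumed utils helpers, not actual project code.
import os

def check_dir(path):
    # Create the directory (and any missing parents) if it does not exist yet.
    if not os.path.isdir(path):
        os.makedirs(path)
    return path

def check_file(path):
    # Ensure the containing directory exists, then create an empty file if needed.
    check_dir(os.path.dirname(path) or '.')
    if not os.path.exists(path):
        open(path, 'a').close()
    return path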
def __set_reporter_obj(self, pobj, settings): '''Builds the basics of the reporter_obj''' self.reporter_obj = { 'module': 'PipelineReport', 'software': { 'python': settings.system['python'] }, 'parameters': {'project': pobj.name, 'sample': self.name}, 'options': settings.options['annovar'], 'reads': { 'inputs': {'normal': [], 'tumor': []}, 'outputs': {'normal': [], 'tumor': []} }, 'alignments': {'inputs': {}, 'outputs': {}}, 'somatic': {'inputs': [], 'outputs': {}} } # INPUTS for a in ['bwa_aln', 'bwa_mem', 'novoalign']: if a in settings.aln_list: # ALIGNMENTS self.reporter_obj['alignments']['inputs'][a] = { 'normal': { 'alignment_summary_metrics': os.path.join(pobj.files['results_path'], \ '{0.name}/01_Alignments/{1}/{0.name}-NORMAL.{1}.metrics.alignment_summary_metrics'.format( self, a)), 'insert_size_metrics': os.path.join(pobj.files['results_path'], \ '{0.name}/01_Alignments/{1}/{0.name}-NORMAL.{1}.metrics.insert_size_metrics'.format( self, a)), 'total_coverage': os.path.join(pobj.files['results_path'], \ '{0.name}/01_Alignments/{1}/{0.name}-NORMAL.{1}.{2}.exons.bed'.format( self, a, pobj.assembly['refname'])) }, 'tumor': { 'alignment_summary_metrics': os.path.join(pobj.files['results_path'], \ '{0.name}/01_Alignments/{1}/{0.name}-TUMOR.{1}.metrics.alignment_summary_metrics'.format( self, a)), 'insert_size_metrics': os.path.join(pobj.files['results_path'], \ '{0.name}/01_Alignments/{1}/{0.name}-TUMOR.{1}.metrics.insert_size_metrics'.format( self, a)), 'total_coverage': os.path.join(pobj.files['results_path'], \ '{0.name}/01_Alignments/{1}/{0.name}-TUMOR.{1}.{2}.exons.bed'.format( self, a, pobj.assembly['refname'])) }, } # SMD for s in ['mutect', 'shimmer', 'sniper', 'strelka', 'varscan', 'virmid']: if s in settings.smd_list: self.reporter_obj['somatic']['inputs'].append({ 'aln': a, 'smd': s, 'annovar': os.path.join(pobj.files['results_path'], \ '{0.name}/04_VariantAnnotation/'.format(self) + \ '{0.name}.{1}.{2}.vcf.annovar.anno.{3}_multianno.txt'.format( self, s, a, pobj.assembly['refname'])), 'vcf': os.path.join(pobj.files['results_path'], \ '{0.name}/03_SomaticMutations/'.format(self) + \ '{0.name}.{1}.{2}.final.vcf'.format(self, s, a)) }) # OUTPUTS self.reporter_obj['alignments']['outputs'] = { 'alignment_summary_metrics': os.path.join(pobj.files['report_path'], \ 'project_alignments/{0.name}.alignment_summary_metrics.txt'.format(self)), 'insert_size_metrics': os.path.join(pobj.files['report_path'], \ 'project_alignments/{0.name}.insert_size_metrics.txt'.format(self)), 'total_coverage': os.path.join(pobj.files['report_path'], \ 'project_alignments/{0.name}.total_coverage.txt'.format(self)), 'path': os.path.join(pobj.files['report_path'], 'project_alignments') } self.reporter_obj['somatic']['outputs'] = { 'smd_snp_table': os.path.join(pobj.files['report_path'], \ 'project_somatic/{0.name}.somatic.snps.tsv'.format(self)) } # Append to reporter_files anames = ['alignment_summary_metrics', 'insert_size_metrics', 'total_coverage'] [self.reporter_files.append(self.reporter_obj['alignments']['outputs'][i]) for i in anames] self.reporter_files.append(self.reporter_obj['somatic']['outputs']['smd_snp_table']) # Check output paths utils.check_dir(self.reporter_obj['alignments']['outputs']['path']) utils.check_dir(os.path.abspath(os.path.dirname(self.reporter_obj['somatic']['outputs']['smd_snp_table'])))
def parse_args(): desc = 'TensorFlow 2.0 implementation of Residual Attribute Generative Adversarial Network (RAG)' parser = argparse.ArgumentParser(description=desc) parser.add_argument('--dataset_name', type=str, default='celeba') parser.add_argument('--phase', type=str, default='tfrecord', choices=('tfrecord', 'train', 'test')) parser.add_argument('--img_size', type=int, default=128) parser.add_argument('--img_nc', type=int, default=3) parser.add_argument('--batch_size', type=int, default=16) parser.add_argument('--lr', type=float, default=0.0001) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--decay_epochs', type=int, default=10) parser.add_argument('--w_adv', type=float, default=1) parser.add_argument('--w_cls', type=float, default=10) parser.add_argument('--w_cyc', type=float, default=10) parser.add_argument('--w_rec', type=float, default=10) parser.add_argument('--w_a', type=float, default=1) parser.add_argument('--w_tv', type=float, default=2.5) parser.add_argument('--gan_type', type=str, default='lsgan', choices=('vanilla', 'lsgan', 'hinge')) parser.add_argument('--log_freq', type=int, default=1000) parser.add_argument('--output_dir', type=str, default='output') parser.add_argument('--log_dir', type=str, default='log') parser.add_argument('--sample_dir', type=str, default='sample') parser.add_argument('--save_dir', type=str, default='model') parser.add_argument('--result_dir', type=str, default='result') parser.add_argument('--test_img', type=str, default='000009.jpg') args = parser.parse_args() check_dir(args.output_dir) args.output_dir = os.path.join(args.output_dir, f'RAG_{args.dataset_name}') check_dir(args.output_dir) args.log_dir = os.path.join(args.output_dir, args.log_dir) check_dir(args.log_dir) args.sample_dir = os.path.join(args.output_dir, args.sample_dir) check_dir(args.sample_dir) args.save_dir = os.path.join(args.output_dir, args.save_dir) check_dir(args.save_dir) args.result_dir = os.path.join(args.output_dir, args.result_dir) check_dir(args.result_dir) if args.dataset_name == 'celeba': args.shorter_size = 178 args.attrs = [ 'Black_Hair', 'Blond_Hair', 'Brown_Hair', 'Male', 'Young', 'Eyeglasses', 'Mouth_Slightly_Open', 'Pale_Skin', 'Rosy_Cheeks', 'Smiling', 'Heavy_Makeup' ] args.label_nc = len(args.attrs) return args
)

dloader_val = data_loader(
    dataset=dataset_val,
    nKnovel=opt.test_way,
    nKbase=0,
    nExemplars=opt.val_shot,  # num training examples per novel category
    nTestNovel=opt.val_query * opt.test_way,  # num test examples for all the novel categories
    nTestBase=0,  # num test examples for all the base categories
    batch_size=1,
    num_workers=0,
    epoch_size=1 * opt.val_episode,  # num of batches per epoch
)

set_gpu(opt.gpu)
check_dir('./experiments/')
check_dir(opt.save_path)

log_file_path = os.path.join(opt.save_path, "train_log.txt")
log(log_file_path, str(vars(opt)))

(embedding_net, cls_head) = get_model(opt)

optimizer = torch.optim.SGD(
    [{'params': embedding_net.parameters()},
     {'params': cls_head.parameters()}],
    lr=0.1, momentum=0.9, weight_decay=5e-4, nesterov=True)

lambda_epoch = lambda e: 1.0 if e < 20 else (0.06 if e < 40 else 0.012 if e < 50 else 0.0024)
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_epoch, last_epoch=-1)

max_val_acc = 0.0
IP = '127.0.0.1'
PORT = 6677
DGRAM_FORMAT = '50s50s50s200s'
CMD_FORMAT = '50s50s50s'
BASE_DIR = './demo'
DEBUG = True
PID_FILE = os.path.join(BASE_DIR, 'pychat_server.pid')
LOG_FILE = os.path.join(BASE_DIR, 'pychat_server.log')

# const
USER_PATH = os.path.join(BASE_DIR, 'user')
MSG_PATH = os.path.join(BASE_DIR, 'msg')
FILE_PATH = os.path.join(BASE_DIR, 'file')
history_msg_file = os.path.join(MSG_PATH, 'history.pk')
offline_msg_file = os.path.join(MSG_PATH, 'offline.pk')
user_file = os.path.join(USER_PATH, 'user.pk')
friend_file = os.path.join(USER_PATH, 'friends.pk')
file_info = os.path.join(FILE_PATH, 'file_info.txt')

check_dir(USER_PATH)
check_dir(MSG_PATH)
check_dir(FILE_PATH)
check_file(file_info)
def train(model, train_loader, test_loader, dev_loader, optimizer, conf, logger): scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=conf['num_epochs'] * len(train_loader), eta_min=1e-6) model.train() best_random_error = 100.0 iter_per_epoch = len(train_loader) if conf['rank'] == 0: summary_dir = os.path.join(conf['exp_dir'], 'tensorX_log') check_dir(summary_dir) tb_writer = SummaryWriter(summary_dir) for epoch in range(conf['num_epochs']): acc_sum = 0. # Accuracy epoch_loss = 0.0 model.train() if conf['rank'] == 0: t_bar = tqdm(ncols=100, total=iter_per_epoch, desc='Epoch:{}'.format(epoch)) for iter_idx, (images, labels, loss_weight) in enumerate(train_loader): if conf['rank'] == 0: t_bar.update() images = images.to(conf['device']) labels = labels.to(conf['device']) loss_weight = loss_weight.to(conf['device']) optimizer.zero_grad() seg_res = model(images) seg_prob = torch.sigmoid(seg_res) seg_res_flat = seg_res.view(seg_res.size(0), -1) labels_flat = labels.view(labels.size(0), -1) loss_weight_flat = loss_weight.view(loss_weight.size(0), -1) loss = F.binary_cross_entropy_with_logits(seg_res_flat, labels_flat, reduction='none') loss = torch.mean(loss * loss_weight_flat) epoch_loss += loss.item() loss.backward() optimizer.step() scheduler.step() acc = get_accuracy(seg_prob, labels) acc_sum += acc step_idx = epoch * iter_per_epoch + iter_idx if conf['rank'] == 0: tb_writer.add_scalar("acc_step", acc, step_idx) tb_writer.add_scalar("loss_step", loss.item(), step_idx) if conf['rank'] == 0: t_bar.close() acc_sum = acc_sum / iter_per_epoch epoch_loss /= iter_per_epoch current_lr = optimizer.param_groups[0]['lr'] # logger.info("[Train] Rank: {} Epoch: [{}/{}] Acc: {:.3f} Loss: {:.3f} Lr:{:.3e}".format(conf['rank'], # epoch, conf['num_epochs'], # acc, epoch_loss, current_lr)) test_acc, test_error, test_pre, test_recall, test_split, test_merge = test( model, test_loader, conf, logger, epoch, best_random_error) dev_acc, dev_error, dev_pre, dev_recall, dev_split, dev_merge = dev_eval( model, dev_loader, conf) logger.info( "[Train] Rank: {} Epoch: [{}/{}] Acc: {:.3f} Loss: {:.3f} Lr:{:.3e} " "R_error: {:.3f} R_pre: {:.3f} R_recall: {:.3f}" " F_split: {:.2f} F_merge: {:.2f}".format(conf['rank'], epoch, conf['num_epochs'], acc_sum, epoch_loss, current_lr, dev_error, dev_pre, dev_recall, dev_split, dev_merge)) if conf['rank'] == 0: tb_writer.add_scalar("test_acc", test_acc, epoch) tb_writer.add_scalar("test_error", test_error, epoch) tb_writer.add_scalar("test_pre", test_pre, epoch) tb_writer.add_scalar("test_recall", test_recall, epoch) tb_writer.add_scalar("test_split", test_split, epoch) tb_writer.add_scalar("test_merge", test_merge, epoch) tb_writer.add_scalar("train_acc", dev_acc, epoch) tb_writer.add_scalar("train_error", dev_error, epoch) tb_writer.add_scalar("train_pre", dev_pre, epoch) tb_writer.add_scalar("train_recall", dev_recall, epoch) tb_writer.add_scalar("train_split", dev_split, epoch) tb_writer.add_scalar("train_merge", dev_merge, epoch) if best_random_error > test_error and conf['rank'] == 0: best_random_error = test_error save_name = 'Best' state_dict = {'model': model.module.state_dict()} save_checkpoint(state_dict, conf['checkpoint_format'].format(save_name)) if epoch % conf['save_per_epoch'] == 0 and conf['rank'] == 0: save_name = 'Epoch-{}'.format(epoch) state_dict = {'model': model.module.state_dict()} save_checkpoint(state_dict, conf['checkpoint_format'].format(save_name)) if conf['rank'] == 0: tb_writer.close()
def main(): from version import __version__ import circ import pipeline from logger import get_logger from utils import check_file, check_dir, check_config, get_thread_num from utils import CIRCparser, TOOLS # Init argparser parser = argparse.ArgumentParser(prog='CIRIquant') # required arguments parser.add_argument( '--config', dest='config_file', metavar='FILE', help='Config file in YAML format', ) parser.add_argument( '-1', '--read1', dest='mate1', metavar='MATE1', help='Input mate1 reads (for paired-end data)', ) parser.add_argument( '-2', '--read2', dest='mate2', metavar='MATE2', help='Input mate2 reads (for paired-end data)', ) # optional arguments parser.add_argument( '-o', '--out', dest='output', metavar='DIR', default=None, help='Output directory, default: ./', ) parser.add_argument( '-p', '--prefix', dest='prefix', metavar='PREFIX', default=None, help='Output sample prefix, default: input sample name', ) parser.add_argument( '-t', '--threads', dest='cpu_threads', default=4, metavar='INT', help='Number of CPU threads, default: 4', ) parser.add_argument( '-a', '--anchor', dest='anchor', default=5, metavar='INT', help='Minimum anchor length for junction alignment, default: 5', ) parser.add_argument( '-l', '--libary-type', dest='library_type', metavar='INT', default=0, help='Library type, 0: unstranded, 1: read1 match the sense strand,' '2: read1 match the antisense strand, default: 0', ) parser.add_argument( '-v', '--verbose', dest='verbosity', default=False, action='store_true', help='Run in debugging mode', ) parser.add_argument( '--version', action='version', version='%(prog)s {version}'.format(version=__version__)) parser.add_argument( '-e', '--log', dest='log_file', default=None, metavar='LOG', help='Log file, default: out_dir/prefix.log', ) # provide pre-defined list of circRNAs parser.add_argument( '--bed', dest='bed', metavar='FILE', default=None, help='bed file for putative circRNAs (optional)', ) parser.add_argument( '--circ', dest='circ', metavar='FILE', default=None, help='circRNA prediction results from other softwares', ) parser.add_argument( '--tool', dest='tool', metavar='TOOL', default=None, help='circRNA prediction tool, required if --circ is provided', ) # when provide RNase R result, do RNase R correction parser.add_argument( '--RNaseR', dest='rnaser', metavar='FILE', default=None, help='CIRIquant result of RNase R sample', ) # skip hisat2 alignment for RNA-seq data parser.add_argument( '--bam', dest='bam', metavar='BAM', default=None, help='hisat2 alignment to reference genome', ) # skip stringtie prediction parser.add_argument( '--no-gene', dest='gene_exp', default=False, action='store_true', help='Skip stringtie estimation for gene abundance', ) args = parser.parse_args() """Check required parameters""" # check input reads if args.mate1 and args.mate2: reads = [check_file(args.mate1), check_file(args.mate2)] else: sys.exit( 'No input files specified, please see manual for detailed information' ) try: lib_type = int(args.library_type) except ValueError: sys.exit( 'Wrong library type, please check your command.\nSupported types:\n0 - unstranded;\n' '1 - read1 match the sense strand;\n2 - read1 match the antisense strand;' ) if lib_type not in [0, 1, 2]: sys.exit( 'Wrong library type, please check your command.\nSupported types:\n0 - unstranded;\n' '1 - read1 match the sense strand;\n2 - read1 match the antisense strand;' ) # check configuration if args.config_file: config = check_config(check_file(args.config_file)) else: sys.exit( 'A config file is needed, 
please see manual for detailed information.' ) """Check optional parameters""" # use circRNA bed file if provided bed_file = check_file(args.bed) if args.bed else None circ_file = check_file(args.circ) if args.circ else None circ_tool = args.tool # user provided RNase R CIRIquant results rnaser_file = check_file(args.rnaser) if args.rnaser else None # pre aligned hisat2 bam hisat_bam = check_file(args.bam) if args.bam else None # Output prefix if args.prefix is None: try: prefix = re.search(r'(\S+)[_/-][12]', os.path.basename(reads[0])).group(1) except AttributeError: sys.exit( 'Ambiguous sample name, please manually select output prefix') else: prefix = args.prefix # check output dir outdir = './' + prefix if args.output is None else args.output outdir = check_dir(outdir) # Parse arguments log_file = os.path.abspath( args.log_file) if args.log_file else '{}/{}.log'.format( outdir, prefix) verbosity = args.verbosity logger = get_logger('CIRIquant', log_file, verbosity) # Add lib to PATH lib_path = os.path.dirname(os.path.split( os.path.realpath(__file__))[0]) + '/libs' os.environ['PATH'] = lib_path + ':' + os.environ['PATH'] os.chmod(lib_path + '/CIRI2.pl', 0o755) """Start Running""" os.chdir(outdir) logger.info( 'Input reads: ' + ','.join([os.path.basename(args.mate1), os.path.basename(args.mate2)])) if lib_type == 0: lib_name = 'unstranded' elif lib_type == 1: lib_name = 'ScriptSeq' elif lib_type == 2: lib_name = 'TAKARA SMARTer' else: sys.exit( 'Unsupported library type, please check the manual for instructions.' ) logger.info('Library type: {}'.format(lib_name)) logger.info('Output directory: {}, Output prefix: {}'.format( outdir, prefix)) logger.info('Config: {} Loaded'.format(config)) thread = get_thread_num(int(args.cpu_threads)) anchor = int(args.anchor) # Step1: Data Preparation # Step1.1: HISAT2 mapping if hisat_bam is None: logger.info('Align RNA-seq reads to reference genome ..') hisat_bam = pipeline.align_genome(log_file, thread, reads, outdir, prefix) else: logger.info( 'HISAT2 alignment bam provided, skipping alignment step ..') logger.debug('HISAT2 bam: {}'.format(os.path.basename(hisat_bam))) # Step1.2: Estimate Gene Abundance if args.gene_exp: logger.info('Skipping gene abundance estimation') else: pipeline.gene_abundance(log_file, thread, outdir, prefix, hisat_bam) # Step3: run CIRI2 if bed_file: logger.info( 'Using user-provided circRNA bed file: {}'.format(bed_file)) else: if circ_file or circ_tool: if circ_file and circ_tool: logger.info( 'Using predicted circRNA results from {}: {}'.format( circ_tool, circ_file)) circ_parser = CIRCparser(circ_file, circ_tool) else: sys.exit( '--circ and --tool must be provided in the same time!') else: logger.info( 'No circRNA information provided, run CIRI2 for junction site prediction ..' ) bwa_sam = pipeline.run_bwa(log_file, thread, reads, outdir, prefix) ciri_file = pipeline.run_ciri(log_file, thread, bwa_sam, outdir, prefix) circ_parser = CIRCparser(ciri_file, 'CIRI2') bed_file = '{}/{}.bed'.format(outdir, prefix) circ_parser.convert(bed_file) # Step4: estimate circRNA expression level out_file = circ.proc(log_file, thread, bed_file, hisat_bam, rnaser_file, reads, outdir, prefix, anchor, lib_type) # Remove temporary files pipeline.clean_tmp(outdir, prefix) logger.info('circRNA Expression profile: {}'.format( os.path.basename(out_file))) logger.info('Finished!')
negative_words = pd.read_csv("../asset/negative-words.txt", header=None, encoding='latin-1')

positive_words_list = convert_words_list(positive_words)
# remove the word "trump" from the positive word list
positive_words_list = [i for i in positive_words_list if i != "trump"]
negative_words_list = convert_words_list(negative_words)

df = scoring_tweets(df, "text", positive_words_list, negative_words_list)
print("Tagging Finished !")

# save
check_dir(opt.output_path)
df.to_csv(os.path.join(opt.output_path, "{}.csv".format("tagging")), index=False)

# ------------------------
# exp
# ------------------------
df = pd.read_csv(os.path.join(opt.output_path, "{}.csv".format("tagging")))
df.dropna(subset=["text"], inplace=True)
print("num_data : ", len(df))

df_train = df.sample(frac=0.8)
print("num_training_data : ", len(df_train))
df_test = df[~df.index.isin(df_train.index)]
print("num_testing_data : ", len(df_test))
assert len(df_train) + len(df_test) == len(df), "it should be the same."
import os

import numpy as np

import viz
from dataset import load_dataset
from utils import check_dir

dataset_id = "omniscient"
datasets_dir = "datasets/"
runs_dir = os.path.join(datasets_dir, dataset_id)
videos_dir = os.path.join(runs_dir, 'videos')
check_dir(videos_dir)

# Load the dataset
dataset = load_dataset(runs_dir)
dataset.load()

# Select the last step of each run
last_steps = dataset.groupby("run").map(lambda x: x.isel(sample=-1))
runs = last_steps.run

# Only choose runs that were interrupted before reaching the goal.
# runs = runs.where(last_steps.goal_reached == False)

n_runs = 10

for _ in range(n_runs):
    run_id = np.random.choice(runs)
    run = dataset.where(dataset.run == run_id, drop=True)
def __init__(self, batch_size, en_optimizer, de_optimizer, en_learning_rate, de_learning_rate, attn_method, train_data_engine, test_data_engine, use_embedding, en_use_attr_init_state, en_hidden_size=100, de_hidden_size=100, en_vocab_size=None, de_vocab_size=None, vocab_size=None, en_embedding_dim=None, de_embedding_dim=None, embedding_dim=None, embeddings=None, en_embedding=True, share_embedding=True, n_decoders=2, cell="GRU", n_en_layers=1, n_de_layers=1, bidirectional=False, feed_last=False, repeat_input=False, batch_norm=False, model_dir="./model", log_dir="./log", is_load=True, check_mem_usage_batches=0, replace_model=True, finetune_embedding=False, model_config=None): # Initialize attributes self.data_engine = train_data_engine self.check_mem_usage_batches = check_mem_usage_batches self.n_decoders = n_decoders self.log_dir = log_dir self.model_dir = model_dir self.en_embedding_dim = en_embedding_dim self.de_embedding_dim = de_embedding_dim self.embedding_dim = embedding_dim self.repeat_input = repeat_input # Initialize embeddings, encoders and decoders """ There are some available options here, most of which matter when using E2E dataset. (You still can use them while using dialogue generation dataset like CMDC, but it's NOT RECOMMENDED.) 1) en_embedding (default True): If the option is on, we're going to add embedding layer into encoder; otherwise, the one-hot vectors are directly fed into encoder's RNN. For now, the decoder always has an embedding layer; this is because that we assumed that the decoder should always output the natural language, and it's reasonable that using an embedding layer instead of directly pass one-hot vectors into RNN. 2) share_embedding (default True): If the option is on, first you should make sure that the input of encoder and decoder are in same vector space, (e.g. both natural language); otherwise, it will cause some strange result, (it is possible that you can train the model without any error, but the shared embedding layer doesn't make sense, as you should know.) When the option is on, the embedding dimension will be the argument embedding_dim, and the vocabulary size will be vocab_size; the argument en_embedding_dim, de_embedding_dim, en_vocab_size and de_vocab_size won't be used. 3) use_embedding (default True): When the option is on: (1) If share_embedding option is on, the shared embedding will be initialized with the embeddings we pass into the model. (2) If en_embedding is on while share_embedding option being off, only the embedding in decoder will be initialized with the pre-trained embeddings, and the encoder embeddings will be trained from scratch (this combination of options is NOT APPROPRIATE when using dialogue generation dataset, as you should know, it's kind of strange that we only initialize the embedding in decoder when both input and output of the encoder and decoder are in same vector space.) 
As mentioned above, since that the options are not disjoint, I'll list some possible combination below, which are reasonable to be tested and compared: 1) en_embedding=True, share_embedding=True, \ use_embedding=True (dialogue generation) 2) en_embedding=True, share_embedding=True, \ use_embedding=False (dialogue generation) 3) en_embedding=True, share_embedding=False, \ use_embedding=True (semantic form to NL) 4) en_embedding=False, share_embedding=X(don't care), \ use_embedding=True (semantic form to NL) 5) en_embedding=True, share_embedding=False, \ use_embedding=False (semantic form to NL) 6) en_embedding=False, share_embedding=X(don't care), \ use_embedding=False (semantic form to NL) """ # embedding layer setting if not en_embedding: en_embed = None de_embed = nn.Embedding(de_vocab_size, de_embedding_dim) if use_embedding: de_embed.weight = embeddings if not finetune_embedding: de_embed.weight.requires_grad = False else: if share_embedding: embed = nn.Embedding(vocab_size, embedding_dim) if use_embedding: embed.weight = embeddings if not finetune_embedding: embed.weight.requires_grad = False en_embed = embed de_embed = embed else: en_embed = nn.Embedding(en_vocab_size, en_embedding_dim) de_embed = nn.Embedding(de_vocab_size, de_embedding_dim) if use_embedding: # in E2ENLG dataset, only decoder use word embedding de_embed.weight = embeddings if not finetune_embedding: de_embed.weight.requires_grad = False self.encoder = EncoderRNN( en_embedding=en_embedding, embedding=en_embed, en_vocab_size=en_vocab_size, en_embedding_dim=(embedding_dim if share_embedding and en_embedding else en_embedding_dim), hidden_size=en_hidden_size, n_layers=n_en_layers, bidirectional=bidirectional, cell=cell) self.cell = cell self.decoders = [] for n in range(n_decoders): decoder = DecoderRNN( embedding=de_embed, de_vocab_size=de_vocab_size, de_embedding_dim=(embedding_dim if share_embedding and en_embedding else self.de_embedding_dim), en_hidden_size=en_hidden_size, de_hidden_size=de_hidden_size, n_en_layers=n_en_layers, n_de_layers=n_de_layers, bidirectional=bidirectional, feed_last=(True if feed_last and n > 0 else False), batch_norm=batch_norm, attn_method=attn_method, cell=cell) self.decoders.append(decoder) self.encoder = self.encoder.cuda() if use_cuda else self.encoder self.decoders = [ decoder.cuda() if use_cuda else decoder for decoder in self.decoders ] # Initialize data loaders and optimizers self.train_data_loader = DataLoader(train_data_engine, batch_size=batch_size, shuffle=True, num_workers=1, drop_last=True, collate_fn=collate_fn, pin_memory=True) self.test_data_loader = DataLoader(test_data_engine, batch_size=batch_size, shuffle=False, num_workers=1, drop_last=True, collate_fn=collate_fn, pin_memory=True) # encoder parameters optimization self.encoder_parameters = filter(lambda p: p.requires_grad, self.encoder.parameters()) self.encoder_optimizer = build_optimizer(en_optimizer, self.encoder_parameters, en_learning_rate) # decoder parameters optimization decoder_parameters = [] for decoder in self.decoders: decoder_parameters.extend(list(decoder.parameters())) self.decoder_parameters = filter(lambda p: p.requires_grad, decoder_parameters) self.decoder_optimizer = build_optimizer(de_optimizer, self.decoder_parameters, de_learning_rate) print_time_info("Model create complete") # check directory and model existence Y, M, D, h, m, s = get_time() if not replace_model: self.model_dir = os.path.join( self.model_dir, "{}{:0>2}{:0>2}_{:0>2}{:0>2}{:0>2}".format(Y, M, D, h, m, s)) if not 
os.path.isdir(self.model_dir): os.makedirs(self.model_dir) else: if not is_load: check_dir(self.model_dir) self.log_dir = os.path.join( self.log_dir, "{}{:0>2}{:0>2}_{:0>2}{:0>2}{:0>2}".format(Y, M, D, h, m, s)) if not os.path.isdir(self.log_dir): os.makedirs(self.log_dir) os.makedirs(os.path.join(self.log_dir, "validation")) with open(os.path.join(self.log_dir, "model_config"), "w+") as f: for arg in vars(model_config): f.write("{}: {}\n".format(arg, str(getattr(model_config, arg)))) f.close() if is_load: self.load_model(model_dir) # Initialize the log files self.logger = Logger(self.log_dir) self.train_log_path = os.path.join(self.log_dir, "train_log.csv") self.valid_batch_log_path = os.path.join(self.log_dir, "valid_batch_log.csv") self.valid_epoch_log_path = os.path.join(self.log_dir, "valid_epoch_log.csv") with open(self.train_log_path, 'w') as file: file.write("epoch, batch, loss, avg-bleu, avg-rouge(1,2,L,BE)\n") with open(self.valid_batch_log_path, 'w') as file: file.write("epoch, batch, loss, avg-bleu, avg-rouge(1,2,L,BE)\n") with open(self.valid_epoch_log_path, 'w') as file: file.write("epoch, loss, avg-bleu, avg-rouge(1,2,L,BE)\n") # Initialize batch count self.batches = 0 self.en_use_attr_init_state = en_use_attr_init_state
import os
import re

import numpy as np
from numpy import random
from scipy.io import loadmat
from scipy.io import savemat

from utils import check_dir

# Concatenate the frequency-domain and LSTM features, then save them again
LSTM_feature_path = "../LSTM_feature"
frequency_feature_path = "../frequency_feature/out"
stack_feature_path = "../stack_feature"
check_dir(stack_feature_path)

sample_list = os.listdir(LSTM_feature_path)
for sample in sample_list:
    loaded_sample_LSTM = loadmat(os.path.join(LSTM_feature_path, sample))
    LSTM_feature = loaded_sample_LSTM['temp']
    loaded_sample_frequency = loadmat(
        os.path.join(frequency_feature_path,
                     sample[0:sample.index('.')] + '_frequency.mat'))
    frequency_feature = loaded_sample_frequency['data']
    stack_feature = np.hstack((LSTM_feature, frequency_feature))
    savemat(os.path.join(stack_feature_path, sample), {'temp': stack_feature})

path = '../stack_feature'
output_path = '../txt/stack_feature'
check_dir(path)
check_dir(output_path)

data = []
def proc(log_file, thread, circ_file, hisat_bam, rnaser_file, reads, outdir, prefix, anchor, lib_type): """ Build pseudo circular reference index and perform reads re-alignment Extract BSJ and FSJ reads from alignment results Returns ----- str output file name """ from utils import check_dir circ_dir = '{}/circ'.format(outdir) check_dir(circ_dir) circ_fasta = '{}/circ/{}_index.fa'.format(outdir, prefix) circ_info = load_bed(circ_file) if rnaser_file: LOGGER.info('Loading RNase R results') rnaser_exp, rnaser_stat = update_info(circ_info, rnaser_file) # extract fasta file for reads alignment generate_index(log_file, circ_info, circ_fasta) # hisat2-build index denovo_index = build_index(log_file, thread, circ_fasta, outdir, prefix) LOGGER.debug('De-novo index: {}'.format(denovo_index)) # hisat2 de novo alignment for candidate reads denovo_bam = denovo_alignment(log_file, thread, reads, outdir, prefix) LOGGER.debug('De-novo bam: {}'.format(denovo_bam)) # Find BSJ and FSJ informations cand_bsj = proc_denovo_bam(denovo_bam, thread, circ_info, anchor, lib_type) bsj_reads, fsj_reads = proc_genome_bam(hisat_bam, thread, circ_info, cand_bsj, anchor, circ_dir) total_reads, mapped_reads = bam_stat(hisat_bam) circ_reads = sum([len(bsj_reads[i]) for i in bsj_reads]) * 2 sample_stat = (total_reads, mapped_reads, circ_reads) sample_exp = expression_level(circ_info, bsj_reads, fsj_reads) # circRNA annotation header = [ 'Sample: {}'.format(prefix), 'Total_Reads: {}'.format(total_reads), 'Mapped_Reads: {}'.format(mapped_reads), 'Circular_Reads: {}'.format(circ_reads), ] out_file = '{}/{}.gtf'.format(outdir, prefix) if rnaser_file: import coeff tmp_header, circ_exp = coeff.correction(sample_exp, sample_stat, rnaser_exp, rnaser_stat) header += tmp_header else: circ_exp = sample_exp from version import __version__ header += ['version: {}'.format(__version__), ] gtf_info = index_annotation(utils.GTF) format_output(circ_info, circ_exp, sample_stat, header, gtf_info, out_file) return out_file
from config import *
import os

import numpy as np
import mxnet as mx

from model.simple_stack import SimpleStack
from utils import check_dir
from memory import Memory
from environments.SimpleEnv import SimpleEnv
from mxnet import gluon

if os.path.exists(summary):
    os.remove(summary)

ctx = mx.cpu()
for i in ["model_save", "data_save"]:
    check_dir(i)

# build models
online_model = SimpleStack()
offline_model = SimpleStack()
online_model.collect_params().initialize(mx.init.Normal(0.02), ctx=ctx)
offline_model.collect_params().initialize(mx.init.Normal(0.02), ctx=ctx)
offline_model.collect_params().zero_grad()

# create env
env = SimpleEnv(display=True)
env.reset_env()
memory_pool = Memory(memory_length)
annealing = 0
total_reward = np.zeros(num_episode)
eval_result = []
loss_func = gluon.loss.L2Loss()
trainer = gluon.Trainer(offline_model.collect_params(), 'adam', {'learning_rate': lr})
def mode_train(train_loader, dev_loader, train_size_aug, dev_size_aug): check_dir(save_root_dir + '/' + model_name) device = torch.device('cuda') if model_pretrained: print('Loading pretrained model from {}'.format(save_root_dir + '/' + model_pretrained + '/model.pth')) model = torch.load(save_root_dir + '/' + model_pretrained + '/model.pth', map_location=device) else: model = VSNet(num_classes=num_classes) model = nn.DataParallel(model, device_ids=[0, 1, 2, 3]) # criterion = nn.MSELoss(reduction='sum') optimizer = optim.Adam(model.parameters(), lr=learning_rate) # optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9) model.to(device) scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=gamma) tb.configure(save_root_dir + '/' + model_name) start_time = time.time() tb_count = 0 for epoch in range(num_epochs): scheduler.step() # Training model.train() running_loss = 0.0 for i, sample in enumerate(train_loader, 0): if i == 1 and epoch == 0: start_time = time.time() img_a, img_b, label = sample optimizer.zero_grad() img_a = img_a.to(device) img_b = img_b.to(device) label = label.to(device) output = model(img_a, img_b) loss = combined_loss_quat(output, label, weights=weights) loss.backward() optimizer.step() running_loss += loss.item() * output.shape[0] output = output.cpu().detach().numpy() label = label.cpu().detach().numpy() error = np.zeros(8) for j in range(output.shape[0]): error[:3] += np.abs(output[j, :3] - label[j, :3]) quat_output = normalize_q(output[j, 3:]) quat_label = label[j, 3:] axis_output, angle_output = axis_angle_from_quat(quat_output) axis_label, angle_label = axis_angle_from_quat(quat_label) error_mag = np.abs(angle_output - angle_label) error_mag = error_mag if error_mag < np.pi else error_mag - np.pi error_dir = angle_between_vectors(axis_output, axis_label) error[3] += np.nan_to_num(error_mag) error[4] += np.nan_to_num(error_dir) rpy_output = np.array(euler_from_quaternion(quat_output)) rpy_label = np.array(euler_from_quaternion(quat_label)) error[5:] += np.abs(rpy_output - rpy_label) error /= output.shape[0] error[:3] *= 1000 error[3:] = np.rad2deg(error[3:]) est_time = (time.time() - start_time) / (epoch * len(train_loader) + i + 1) * ( num_epochs * len(train_loader)) est_time = str(datetime.timedelta(seconds=est_time)) print( '[TRAIN][{}][EST:{}] Epoch {}, Batch {}, Loss = {:0.7f}, error: x={:0.2f}mm,y={:0.2f}mm,z={:0.2f}mm,mag={:0.2f}deg,dir={:0.2f}deg,roll={:0.2f}deg,pitch={:0.2f}deg,yaw={:0.2f}deg'.format( time.time() - start_time, est_time, epoch + 1, i + 1, loss.item(), *error)) tb.log_value(name='Loss', value=loss.item(), step=tb_count) tb.log_value(name='x/mm', value=error[0], step=tb_count) tb.log_value(name='y/mm', value=error[1], step=tb_count) tb.log_value(name='z/mm', value=error[2], step=tb_count) tb.log_value(name='mag/deg', value=error[3], step=tb_count) tb.log_value(name='dir/deg', value=error[4], step=tb_count) tb.log_value(name='roll/deg', value=error[5], step=tb_count) tb.log_value(name='pitch/deg', value=error[6], step=tb_count) tb.log_value(name='yaw/deg', value=error[7], step=tb_count) tb_count += 1 # Dev eval model.eval() with torch.no_grad(): running_error_dev = np.zeros(8) # running_error_dev = np.zeros(2) for i, sample in enumerate(dev_loader, 0): img_a, img_b, label = sample img_a = img_a.to(device) img_b = img_b.to(device) output = model(img_a, img_b) output = output.cpu().detach().numpy() label = label.numpy() error = np.zeros(8) # error = np.zeros(2) for j in range(output.shape[0]): error[:3] += 
np.abs(output[j, :3] - label[j, :3]) quat_output = normalize_q(output[j, 3:]) quat_label = label[j, 3:] axis_output, angle_output = axis_angle_from_quat(quat_output) axis_label, angle_label = axis_angle_from_quat(quat_label) error_mag = np.abs(angle_output - angle_label) error_mag = error_mag if error_mag < np.pi else error_mag - np.pi error_dir = angle_between_vectors(axis_output, axis_label) error[3] += np.nan_to_num(error_mag) error[4] += np.nan_to_num(error_dir) rpy_output = np.array(euler_from_quaternion(quat_output)) rpy_label = np.array(euler_from_quaternion(quat_label)) error[5:] += np.abs(rpy_output - rpy_label) error[:3] *= 1000 error[3:] = np.rad2deg(error[3:]) running_error_dev += error error /= output.shape[0] print( '[EVAL][{}] Epoch {}, Batch {}, error: x={:0.2f}mm,y={:0.2f}mm,z={:0.2f}mm,mag={:0.2f}deg,dir={:0.2f}deg'.format( time.time() - start_time, epoch + 1, i + 1, *error)) average_loss = running_loss / train_size_aug average_error = running_error_dev / dev_size_aug print( '[SUMMARY][{}] Summary: Epoch {}, loss = {:0.7f}, dev_eval: x={:0.2f}mm,y={:0.2f}mm,z={:0.2f}mm,mag={:0.2f}deg,dir={:0.2f}deg,roll={:0.2f}deg,pitch={:0.2f}deg,yaw={:0.2f}deg\n\n'.format( time.time() - start_time, epoch + 1, average_loss, *average_error)) tb.log_value(name='Dev loss', value=average_loss, step=epoch) tb.log_value(name='Dev x/mm', value=average_error[0], step=epoch) tb.log_value(name='Dev y/mm', value=average_error[1], step=epoch) tb.log_value(name='Dev z/mm', value=average_error[2], step=epoch) tb.log_value(name='Dev mag/deg', value=average_error[3], step=epoch) tb.log_value(name='Dev dir/deg', value=average_error[4], step=epoch) tb.log_value(name='Dev roll/deg', value=average_error[5], step=epoch) tb.log_value(name='Dev pitch/deg', value=average_error[6], step=epoch) tb.log_value(name='Dev yaw/deg', value=average_error[7], step=epoch) torch.save(model, save_root_dir + '/' + model_name + '/model.pth') print('Model saved at {}/{}/model.pth'.format(save_root_dir, model_name))
import discord
import logging
from discord.ext import commands
import utils
import sys
import os

CONFIG_FILE = 'discordbot.config'

options = utils.get_opts(sys.argv[1:])

# Make sure the log directory exists before attaching the file handler
if not utils.check_dir('logs'):
    os.mkdir('logs')

logger = logging.getLogger('discord')
logger.setLevel(logging.INFO)  # Change this to get DEBUG info if necessary
handler = logging.FileHandler(filename='logs/discordbot.log', encoding='utf-8', mode='w')
handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s:%(name)s: %(message)s'))
logger.addHandler(handler)

if options.config:
    config = utils.read_config(file=options.config)
else:
    config = utils.read_config()

logger.info(f'Reading Configuration file: {config}')
logger.info('Starting bot...')

bot = commands.Bot(command_prefix=utils.get_prefix, description=config['description'])
argparser.add_argument("--alignroot", metavar="Alignment Dir", type=str, required=True)
argparser.add_argument("--treeroot", metavar="Tree Dir", type=str, required=True)
argparser.add_argument("--outroot", metavar="Output Dir + working dir + logdir", type=str, required=True)

args = argparser.parse_args()

alignroot = args.alignroot
treeroot = args.treeroot
outroot = args.outroot

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Create log directory and out directory
check_dir(outroot)

out_pre_dir = path.join(outroot, "out")
check_dir(out_pre_dir)

logdir = path.join(outroot, "logs")
check_dir(logdir)

if os.getcwd() != out_pre_dir:
    os.chdir(out_pre_dir)

for infile in glob(path.join(alignroot, "*/*.phy")):
    print infile
    with open(infile, 'r') as f:
def parse_args(): desc = '''TensorFlow 2.0 implementation of Unsupervised Generative Attentional Networks with Adaptive Layer-Instance Normalization for Image-to-Image Translation (U-GAT-IT)''' parser = argparse.ArgumentParser(description=desc) parser.add_argument('--dataset_name', type=str, default='selfie2anime') parser.add_argument('--phase', type=str, default='tfrecord', choices=('tfrecord', 'train', 'test')) parser.add_argument('--img_size', type=int, default=256) parser.add_argument('--img_nc', type=int, default=3) parser.add_argument('--batch_size', type=int, default=1) parser.add_argument('--lr', type=float, default=0.0001) parser.add_argument('--iteration', type=int, default=10000) parser.add_argument('--epochs', type=int, default=50) parser.add_argument('--decay_epochs', type=int, default=50) parser.add_argument('--w_adv', type=float, default=1) parser.add_argument('--w_cyc', type=float, default=10) parser.add_argument('--w_rec', type=float, default=10) parser.add_argument('--w_cam', type=float, default=1000) parser.add_argument('--gan_type', type=str, default='lsgan', choices=('vanilla', 'lsgan', 'hinge')) parser.add_argument('--log_freq', type=int, default=1000) parser.add_argument('--output_dir', type=str, default='output') parser.add_argument('--log_dir', type=str, default='log') parser.add_argument('--sample_dir', type=str, default='sample') parser.add_argument('--save_dir', type=str, default='model') parser.add_argument('--result_dir', type=str, default='result') args = parser.parse_args() check_dir(args.output_dir) args.output_dir = os.path.join(args.output_dir, f'UGATIT_{args.dataset_name}') check_dir(args.output_dir) args.log_dir = os.path.join(args.output_dir, args.log_dir) check_dir(args.log_dir) args.sample_dir = os.path.join(args.output_dir, args.sample_dir) check_dir(args.sample_dir) args.save_dir = os.path.join(args.output_dir, args.save_dir) check_dir(args.save_dir) args.result_dir = os.path.join(args.output_dir, args.result_dir) check_dir(args.result_dir) return args
def main():
    args = parser.parse_args()

    # torch setting
    torch.random.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # os setting
    path = args.dataset_path
    train_path = os.path.join(path, "train/train.txt")
    validation_path = os.path.join(path, "valid/valid.txt")
    test_path = os.path.join(path, "test/test.txt")
    params_path = os.path.join(args.model_dir, 'params.json')
    checkpoint_dir = os.path.join(args.model_dir, 'checkpoint')
    tensorboard_log_dir = os.path.join(args.model_dir, 'log')
    utils.check_dir(tensorboard_log_dir)

    entity2id, relation2id = data_loader.create_mappings(train_path)

    # params
    params = utils.Params(params_path)
    params.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # dataset
    train_set = data_loader.FB15KDataset(train_path, entity2id, relation2id)
    train_generator = torch_data.DataLoader(train_set, batch_size=params.batch_size)
    validation_set = data_loader.FB15KDataset(validation_path, entity2id, relation2id)
    validation_generator = torch_data.DataLoader(validation_set, batch_size=params.validation_batch_size)
    test_set = data_loader.FB15KDataset(test_path, entity2id, relation2id)
    test_generator = torch_data.DataLoader(test_set, batch_size=params.validation_batch_size)

    # model
    model = net.Net(entity_count=len(entity2id),
                    relation_count=len(relation2id),
                    dim=params.embedding_dim,
                    margin=params.margin,
                    device=params.device,
                    norm=params.norm)  # type: torch.nn.Module
    model = model.to(params.device)
    optimizer = optim.SGD(model.parameters(), lr=params.learning_rate)
    summary_writer = tensorboard.SummaryWriter(log_dir=tensorboard_log_dir)

    start_epoch_id = 1
    step = 0
    best_score = 0.0

    print("Training Dataset: entity: {} relation: {} triples: {}".format(
        len(entity2id), len(relation2id), len(train_set)))
    print("Validation Dataset: triples: {}".format(len(validation_set)))
    print("Test Dataset: triples: {}".format(len(test_set)))
    print(model)

    # Train
    for epoch_id in range(start_epoch_id, params.epochs + 1):
        print("Epoch {}/{}".format(epoch_id, params.epochs))
        loss_impacting_samples_count = 0
        samples_count = 0
        model.train()

        with tqdm(total=len(train_generator)) as t:
            for local_heads, local_relations, local_tails in train_generator:
                local_heads, local_relations, local_tails = (local_heads.to(params.device),
                                                             local_relations.to(params.device),
                                                             local_tails.to(params.device))
                positive_triples = torch.stack((local_heads, local_relations, local_tails), dim=1)

                # Preparing negatives.
                # Generate binary tensor to replace either head or tail.
                # 1 means replace head, 0 means replace tail.
                head_or_tail = torch.randint(high=2, size=local_heads.size(), device=params.device)
                random_entities = torch.randint(high=len(entity2id), size=local_heads.size(), device=params.device)
                broken_heads = torch.where(head_or_tail == 1, random_entities, local_heads)
                broken_tails = torch.where(head_or_tail == 0, random_entities, local_tails)
                negative_triples = torch.stack((broken_heads, local_relations, broken_tails), dim=1)

                optimizer.zero_grad()
                loss, pd, nd = model(positive_triples, negative_triples)
                loss.mean().backward()

                summary_writer.add_scalar('Loss/train', loss.mean().data.cpu().numpy(), global_step=step)
                summary_writer.add_scalar('Distance/positive', pd.sum().data.cpu().numpy(), global_step=step)
                summary_writer.add_scalar('Distance/negative', nd.sum().data.cpu().numpy(), global_step=step)

                loss = loss.data.cpu()
                loss_impacting_samples_count += loss.nonzero().size()[0]
                samples_count += loss.size()[0]

                optimizer.step()
                step += 1
                t.set_postfix(loss=loss_impacting_samples_count / samples_count * 100)
                t.update()

        summary_writer.add_scalar('Metrics/batch_loss',
                                  loss_impacting_samples_count / samples_count * 100,
                                  global_step=epoch_id)

        # validation
        if epoch_id % params.validation_freq == 0:
            model.eval()
            _, _, hits_at_10, _ = evaluate(model=model,
                                           data_generator=validation_generator,
                                           entities_count=len(entity2id),
                                           device=params.device,
                                           summary_writer=summary_writer,
                                           epoch_id=epoch_id,
                                           metric_suffix="val")
            score = hits_at_10
            if score > best_score:
                best_score = score
                utils.save_checkpoint(checkpoint_dir, model, optimizer, epoch_id, step, best_score)

    # Testing the best checkpoint on test dataset
    utils.load_checkpoint(checkpoint_dir, model, optimizer)
    best_model = model.to(params.device)
    best_model.eval()
    scores = evaluate(model=best_model,
                      data_generator=test_generator,
                      entities_count=len(entity2id),
                      device=params.device,
                      summary_writer=summary_writer,
                      epoch_id=1,
                      metric_suffix="test")
    print("Test scores: \n hit@1: {} \n hit@3: {} \n hit@10: {} \n mrr: {}".format(
        scores[0], scores[1], scores[2], scores[3]))

    eval_path = os.path.join(args.model_dir, 'eval.json')
    evals_params = utils.Params(eval_path)
    evals_params.hit_1 = scores[0]
    evals_params.hit_3 = scores[1]
    evals_params.hit_10 = scores[2]
    evals_params.mrr = scores[3]
    evals_params.best_score = best_score
    evals_params.save(eval_path)
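# The call `loss, pd, nd = model(positive_triples, negative_triples)` above returns a
# per-sample loss plus the positive and negative distances. Below is a minimal sketch of
# what such a forward pass could look like for a TransE-style model, assuming a margin
# ranking loss over L-p distances; it is an illustration only, not the actual net.Net.
import torch
import torch.nn as nn

class TransESketch(nn.Module):
    def __init__(self, entity_count, relation_count, dim=50, margin=1.0, norm=1):
        super().__init__()
        self.entities = nn.Embedding(entity_count, dim)
        self.relations = nn.Embedding(relation_count, dim)
        self.norm = norm
        self.criterion = nn.MarginRankingLoss(margin=margin, reduction='none')

    def _distance(self, triples):
        # triples: (batch, 3) with columns = head id, relation id, tail id
        h = self.entities(triples[:, 0])
        r = self.relations(triples[:, 1])
        t = self.entities(triples[:, 2])
        return (h + r - t).norm(p=self.norm, dim=1)

    def forward(self, positive_triples, negative_triples):
        pd = self._distance(positive_triples)   # should be small for true triples
        nd = self._distance(negative_triples)   # should be large for corrupted triples
        target = torch.ones_like(pd)            # rank negatives above positives
        loss = self.criterion(nd, pd, target)   # per-sample hinge: max(0, margin - (nd - pd))
        return loss, pd, nd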
def test(model, test_loader, conf, logger, epoch, best_random_error):
    model.eval()
    acc = 0.  # Accuracy
    random_error_avg = 0.0
    random_precision_avg = 0.0
    random_recall_avg = 0.0
    false_split_avg = 0.0
    false_merge_avg = 0.0
    length = 0

    # here we store the 5 test images in the same big image
    result_store_dir = os.path.join(conf['exp_dir'], 'result')
    if conf['rank'] == 0:
        check_dir(result_store_dir)
    store_path_fmt = os.path.join(result_store_dir, 'epoch-{}-{}.png')

    # here we store each predicted image in a .png
    result_single_image_dir = os.path.join(conf['exp_dir'], 'result_single', 'epoch-{}'.format(epoch))
    dist.barrier()

    with torch.no_grad():
        for iter_idx, (images, labels, _) in enumerate(test_loader):
            images = images.to(conf['device'])
            labels = labels.to(conf['device'])
            seg_res = model(images)
            seg_prob = torch.sigmoid(seg_res)

            acc += get_accuracy(seg_prob, labels)
            random_error, random_precision, random_recall, false_split, false_merge = get_metric_val(
                labels, seg_prob)
            random_error_avg += random_error
            random_precision_avg += random_precision
            random_recall_avg += random_recall
            false_split_avg += false_split
            false_merge_avg += false_merge
            length += images.size(0)

            if epoch % conf['save_per_epoch'] == 0 and conf['rank'] == 0:
                torchvision.utils.save_image(images.data.cpu() + 0.5, store_path_fmt.format(epoch, 'image'))
                torchvision.utils.save_image(labels.data.cpu(), store_path_fmt.format(epoch, 'GT'))
                torchvision.utils.save_image(seg_prob.data.cpu(), store_path_fmt.format(epoch, 'SR'))
                torchvision.utils.save_image((seg_prob > 0.5).float().data.cpu(), store_path_fmt.format(epoch, 'PRE'))

                check_dir(result_single_image_dir)
                for i in range(seg_prob.shape[0]):
                    store_path = os.path.join(result_single_image_dir, '{}.png'.format(i))
                    torchvision.utils.save_image((seg_prob > 0.5).float()[i].data.cpu(), store_path)
                    store_path = os.path.join(result_single_image_dir, '{}-prob.png'.format(i))
                    torchvision.utils.save_image(seg_prob[i].data.cpu(), store_path)

    acc = acc / len(test_loader)
    random_error_avg /= len(test_loader)
    random_precision_avg /= len(test_loader)
    random_recall_avg /= len(test_loader)
    false_split_avg /= len(test_loader)
    false_merge_avg /= len(test_loader)

    if random_error_avg < best_random_error and conf['rank'] == 0:
        torchvision.utils.save_image(images.data.cpu() + 0.5, store_path_fmt.format('Best', 'image'))
        torchvision.utils.save_image(labels.data.cpu(), store_path_fmt.format('Best', 'GT'))
        torchvision.utils.save_image(seg_prob.data.cpu(), store_path_fmt.format('Best', 'SR'))
        torchvision.utils.save_image((seg_prob > 0.5).float().data.cpu(), store_path_fmt.format('Best', 'PRE'))

        result_single_image_dir = os.path.join(conf['exp_dir'], 'result_single', 'Best')
        check_dir(result_single_image_dir)
        for i in range(seg_prob.shape[0]):
            store_path = os.path.join(result_single_image_dir, '{}.png'.format(i))
            torchvision.utils.save_image((seg_prob > 0.5).float()[i].data.cpu(), store_path)
            store_path = os.path.join(result_single_image_dir, '{}-prob.png'.format(i))
            torchvision.utils.save_image(seg_prob[i].data.cpu(), store_path)

    # if conf['rank'] == 0:
    #     logger.info("[Test] Rank: {} Epoch: [{}/{}] Acc: {:.3f}".format(conf['rank'],
    #                                                                     epoch, conf['num_epochs'],
    #                                                                     acc))
    if conf['rank'] == 0:
        logger.info(
            "[Test] Rank: {} Epoch: [{}/{}] Acc: {:.3f} R_error: {:.3f} R_pre: {:.3f} R_recall: {:.3f}"
            " F_split: {:.2f} F_merge: {:.2f}".format(
                conf['rank'], epoch, conf['num_epochs'], acc, random_error_avg,
                random_precision_avg, random_recall_avg, false_split_avg, false_merge_avg))

    return acc, random_error_avg, random_precision_avg, random_recall_avg, false_split_avg, false_merge_avg
def main():
    textgrid_folder = sys.argv[1]
    out_dir = sys.argv[2] + '/out/'
    utils.check_dir(out_dir)
    create_files(textgrid_folder, out_dir)
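# The snippets in this file all lean on a check_dir helper that is not defined here.
# A minimal sketch of the most common behaviour is given below, assuming the helper simply
# creates the directory (including parents) when it is missing; note that a few call sites
# treat it differently (e.g. as a bare existence test paired with create_dir, or call it
# with no arguments), so each project's own utils.check_dir is authoritative.
import os

def check_dir(path):
    """Create `path` (and any missing parent directories) if it does not already exist."""
    if not os.path.isdir(path):
        os.makedirs(path)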
def format_alignment(fasta, tree, outdir):
    treeroot = tree
    fastaroot = path.join(fasta, "*/*prank.best.fas")
    check_dir(outdir)
    for infile in glob(fastaroot):
        # print progress
        print infile
        basename = path.basename(infile).partition('.')[0]
        basename = "".join(basename.split("_")[0] + "_" + basename.split("_")[1])
        prefix = basename.partition('_')[0][:2]
        fastafile = infile
        treedir = path.join(treeroot, prefix)
        treefile = path.join(treedir, basename + '.nh')

        # create the first 2 directories (fasta_out, fasta_AA_out)
        fasta_out_dir = path.join(outdir, "fasta")
        check_dir(fasta_out_dir)
        fasta_AA_out_dir = path.join(outdir, "fasta_AA")
        check_dir(fasta_AA_out_dir)
        fasta_out_subdir = path.join(fasta_out_dir, prefix)
        check_dir(fasta_out_subdir)
        fasta_out_file_path = path.join(fasta_out_subdir, "".join(basename + ".fa"))
        fasta_AA_out_subdir = path.join(fasta_AA_out_dir, prefix)
        check_dir(fasta_AA_out_subdir)
        fasta_AA_out_file_path = path.join(fasta_AA_out_subdir, "".join(basename + ".fa"))

        fasta_out_file = open(fasta_out_file_path, "w")
        fasta_AA_out_file = open(fasta_AA_out_file_path, "w")
        for ID in SeqIO.parse(fastafile, "fasta", alphabet=IUPAC.unambiguous_dna):
            tree_ids = Tree(newick=treefile)
            for tree_id in tree_ids.iter_leaf_names():
                if tree_id.find(ID.id) != -1:
                    # print ID.id
                    ID.id = tree_id
                    # ID.name = ""
                    ID.description = ""
                    # print ID.id
                    # print ID
            # write the normal fasta out
            SeqIO.write(ID, fasta_out_file, "fasta")
            # translate cDNA and write AA fasta
            aa_seq = []
            coding_dna = ID.seq
            # print coding_dna
            for codon in grouper(coding_dna, 3):
                cog = "".join(codon)
                if cog == "---":
                    aa_seq.append("-")
                else:
                    cog_aa = translate(cog)
                    aa_seq.append(cog_aa)
            aa_seq = "".join(aa_seq)
            ID = SeqRecord(Seq(aa_seq, IUPAC.protein), id=ID.id, name=ID.name)
            ID.description = ""
            SeqIO.write(ID, fasta_AA_out_file, "fasta")
        fasta_out_file.close()
        fasta_AA_out_file.close()

        phy_out_dir = path.join(outdir, "phylip")
        check_dir(phy_out_dir)
        phy_AA_out_dir = path.join(outdir, "phylip_AA")
        check_dir(phy_AA_out_dir)
        phy_out_subdir = path.join(phy_out_dir, prefix)
        check_dir(phy_out_subdir)
        phy_out_file_path = path.join(phy_out_subdir, "".join(basename + ".phy"))
        phy_AA_out_subdir = path.join(phy_AA_out_dir, prefix)
        check_dir(phy_AA_out_subdir)
        phy_AA_out_file_path = path.join(phy_AA_out_subdir, "".join(basename + ".phy"))

        fasta_alignment = open(fasta_out_file_path, "rU")
        fasta_AA_alignment = open(fasta_AA_out_file_path, "rU")
        phy_out_file = open(phy_out_file_path, "w")
        phy_AA_out_file = open(phy_AA_out_file_path, "w")
        alignments = AlignIO.parse(fasta_alignment, "fasta")
        AlignIO.write(alignments, phy_out_file, "phylip-relaxed")
        fasta_alignment.close()
        phy_out_file.close()
        alignments_AA = AlignIO.parse(fasta_AA_alignment, "fasta")
        AlignIO.write(alignments_AA, phy_AA_out_file, "phylip-relaxed")
        fasta_AA_alignment.close()
        phy_AA_out_file.close()
def main(config, rank, world_size, gpu_id, port, kwargs):
    torch.backends.cudnn.benchmark = True
    conf = parse_config_or_kwargs(config, **kwargs)

    # --------- multi machine train set up --------------
    if conf['train_local'] == 1:
        host_addr = 'localhost'
        conf['rank'] = rank
        conf['local_rank'] = gpu_id  # specify the local gpu id
        conf['world_size'] = world_size
        dist_init(host_addr, conf['rank'], conf['local_rank'], conf['world_size'], port)
    else:
        host_addr = getoneNode()
        conf['rank'] = int(os.environ['SLURM_PROCID'])
        conf['local_rank'] = int(os.environ['SLURM_LOCALID'])
        conf['world_size'] = int(os.environ['SLURM_NTASKS'])
        dist_init(host_addr, conf['rank'], conf['local_rank'], conf['world_size'],
                  '2' + os.environ['SLURM_JOBID'][-4:])
    gpu_id = conf['local_rank']
    # --------- multi machine train set up --------------

    # setup logger
    if conf['rank'] == 0:
        check_dir(conf['exp_dir'])
        logger = get_logger_2(os.path.join(conf['exp_dir'], 'train.log'),
                              "[ %(asctime)s ] %(message)s")
    dist.barrier()  # let the rank 0 mkdir first
    if conf['rank'] != 0:
        logger = get_logger_2(os.path.join(conf['exp_dir'], 'train.log'),
                              "[ %(asctime)s ] %(message)s")
    logger.info("Rank: {}/{}, local rank:{} is running".format(
        conf['rank'], conf['world_size'], conf['local_rank']))

    # write the config file to the exp_dir
    if conf['rank'] == 0:
        store_path = os.path.join(conf['exp_dir'], 'config.yaml')
        store_yaml(config, store_path, **kwargs)

    cuda_id = 'cuda:' + str(gpu_id)
    conf['device'] = torch.device(cuda_id if torch.cuda.is_available() else 'cpu')

    model_dir = os.path.join(conf['exp_dir'], 'models')
    if conf['rank'] == 0:
        check_dir(model_dir)
    conf['checkpoint_format'] = os.path.join(model_dir, '{}.th')

    set_seed(666 + conf['rank'])

    if 'R' in conf['model_type']:
        model = eval(conf['model_type'])(base_ch_num=conf['base_ch_num'], t=conf['t'])
    else:
        model = eval(conf['model_type'])(base_ch_num=conf['base_ch_num'])
    model = model.to(conf['device'])
    model = DDP(model, device_ids=[conf['local_rank']], output_device=conf['local_rank'])
    optimizer = optim.Adam(model.parameters(), lr=conf['lr'], betas=(0.5, 0.99))

    if conf['rank'] == 0:
        num_params = sum(param.numel() for param in model.parameters())
        logger.info("Model type: {} Base channel num:{}".format(
            conf['model_type'], conf['base_ch_num']))
        logger.info("Number of parameters: {:.4f}M".format(1.0 * num_params / 1e6))
        logger.info(optimizer)

    train_set = ImageFolder(root=conf['root'],
                            mode='train',
                            augmentation_prob=conf['aug_prob'],
                            crop_size_min=conf['crop_size_min'],
                            crop_size_max=conf['crop_size_max'],
                            data_num=conf['data_num'],
                            gauss_size=conf['gauss_size'],
                            data_aug_list=conf['aug_list'])
    train_loader = DataLoader(dataset=train_set,
                              batch_size=conf['batch_size'],
                              shuffle=conf['shuffle'],
                              num_workers=conf['num_workers'])
    dev_set = ImageFolder(root=conf['root'], mode='train', augmentation_prob=0.0)
    dev_loader = DataLoader(dataset=dev_set, batch_size=5, shuffle=False, num_workers=1)
    valid_set = ImageFolder(root=conf['root'], mode='valid')
    valid_loader = DataLoader(dataset=valid_set, batch_size=5, shuffle=False, num_workers=1)
    test_set = ImageFolder(root=conf['root'], mode='test')
    test_loader = DataLoader(dataset=test_set, batch_size=5, shuffle=False, num_workers=1)

    dist.barrier()  # synchronize here
    train(model, train_loader, test_loader, dev_loader, optimizer, conf, logger)
def parse_args():
    desc = 'TensorFlow 2.0 implementation of Deep Convolutional Generative Adversarial Network (DCGAN)'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('--dataset_name', type=str, default='celeba')
    parser.add_argument('--phase', type=str, default='tfrecord', choices=('tfrecord', 'train', 'test'))
    parser.add_argument('--img_size', type=int, default=64)
    parser.add_argument('--img_nc', type=int, default=3)
    parser.add_argument('--z_dim', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--iteration', type=int, default=100000)
    parser.add_argument('--log_freq', type=int, default=1000)
    parser.add_argument('--sample_freq', type=int, default=1000)
    parser.add_argument('--save_freq', type=int, default=10000)
    parser.add_argument('--output_dir', type=str, default='output')
    parser.add_argument('--log_dir', type=str, default='log')
    parser.add_argument('--sample_dir', type=str, default='sample')
    parser.add_argument('--save_dir', type=str, default='model')
    parser.add_argument('--result_dir', type=str, default='result')
    parser.add_argument('--lr', type=float, default=0.0002)
    parser.add_argument('--gan_type', type=str, default='vanilla', choices=('vanilla', 'lsgan', 'hinge'))
    args = parser.parse_args()

    check_dir(args.output_dir)
    args.output_dir = os.path.join(args.output_dir, f'DCGAN_{args.dataset_name}')
    check_dir(args.output_dir)
    args.log_dir = os.path.join(args.output_dir, args.log_dir)
    check_dir(args.log_dir)
    args.sample_dir = os.path.join(args.output_dir, args.sample_dir)
    check_dir(args.sample_dir)
    args.save_dir = os.path.join(args.output_dir, args.save_dir)
    check_dir(args.save_dir)
    args.result_dir = os.path.join(args.output_dir, args.result_dir)
    check_dir(args.result_dir)
    return args
def process(self, image, output_path=None, output_name=None):
    image_origin = image
    check_dir(output_path)
    cv2.imwrite(join(output_path, output_name + '.origin.png'), image_origin)
    # image_name =
    # image_origin = cv2.imread(image_path)
    image_origin_height, image_origin_width = image_origin.shape[0:2]
    # print('Width', image_origin_width, 'Height', image_origin_height)
    image_crop, image_edge = format_image_rgb(image_origin)
    cv2.imwrite(join(output_path, output_name + '.landmark.crop.png'), image_crop)
    # print('Image Data', image_crop, 'Image Edge', image_edge)
    image_crop_resize = cv2.resize(image_crop, (128, 128))
    cv2.imwrite(join(output_path, output_name + '.landmark.resize.png'), image_crop_resize)
    # image_data = cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)
    # print('Image', image_crop_resize)
    predictions = self.sess.run(self.landmark_logits,
                                feed_dict={self.inputs: image_crop_resize})
    # print(predictions)
    # print('Len predictions', predictions)
    marks = np.array(predictions).flatten()
    marks = np.reshape(marks, (-1, 2))
    # print(marks)
    # width =
    # print('Image edge shape', image_edge)
    # scale the normalized marks by the crop size, then shift by the crop offset
    marks *= (image_edge[2] - image_edge[0])
    marks[:, 0] += image_edge[0]
    marks[:, 1] += image_edge[1]
    # print(marks)
    with open(join(output_path, output_name + '.marks.txt'), 'w', encoding='utf-8') as f:
        f.write(json.dumps(marks.tolist()))
    for mark in marks:
        cv2.circle(image_origin, tuple(mark), 3, (255, 0, 0))
    cv2.imwrite(join(output_path, output_name + '.landmark.png'), image_origin)
    pose_estimator = PoseEstimator(img_size=(image_origin_height, image_origin_width))
    # pose_estimator
    pose = pose_estimator.solve_pose_by_68_points(marks)
    print('Pose', pose)
    with open(join(output_path, output_name + '.pose.txt'), 'w', encoding='utf-8') as f:
        f.write(json.dumps(pose))
    return pose
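# A small, self-contained illustration of the coordinate arithmetic above, assuming
# format_image_rgb returns a square crop described by image_edge = (x1, y1, x2, y2) in
# original-image pixels and that the model outputs landmarks normalized to [0, 1] within
# that crop; the real helpers may use a different convention, so treat this as a sketch.
import numpy as np

def remap_landmarks(normalized_marks, crop_edge):
    """Map (N, 2) landmarks in [0, 1] crop coordinates back to original-image pixels."""
    x1, y1, x2, y2 = crop_edge
    marks = np.asarray(normalized_marks, dtype=np.float64).reshape(-1, 2).copy()
    marks *= (x2 - x1)   # scale by the crop size (square crop: width == height)
    marks[:, 0] += x1    # shift x back to the crop's left edge
    marks[:, 1] += y1    # shift y back to the crop's top edge
    return marks

# e.g. a landmark at (0.5, 0.5) inside a crop spanning (100, 100)-(300, 300)
# lands at pixel (200.0, 200.0) in the original image.
print(remap_landmarks([[0.5, 0.5]], (100, 100, 300, 300)))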
from utils import check_dir

gpu_id = 0
batch_size = 1000
torch.cuda.set_device(gpu_id)
parser = InitParser()
net = Stack_Bi_LSTM(parser)

weights_path = "../output/fine_tuning_result/Network_fine_tuning.pth.gz"
weights = torch.load(weights_path, map_location='cuda:%d' % (gpu_id))  # load weights onto the chosen GPU
net.load_state_dict(weights)
net.cuda().eval()  # evaluation mode, on GPU

all_data_set = MySet(parser.fine_tuning_txt_path, mode="all")
all_data_loader = DataLoader(all_data_set, batch_size=batch_size, shuffle=True)

output_path = '../LSTM_feature'
check_dir(output_path)

with torch.no_grad():
    for batch_idx, (sequence, label, name) in enumerate(all_data_loader):
        sequence = sequence.float().cuda()  # move the batch to GPU
        label = label.data.numpy()
        predict, feature = net(sequence.permute(1, 0, 2))
        feature = feature.data.cpu().numpy()
        for i in range(feature.shape[0]):
            savemat(os.path.join(output_path, re.search('[a-z]*_[0-9]*.mat', name[i]).group()),
                    {'temp': feature[i, :]})
    nKnovel=opt.test_way,
    nKbase=0,
    nExemplars=opt.val_shot,                  # num training examples per novel category
    nTestNovel=opt.val_query * opt.test_way,  # num test examples for all the novel categories
    nTestBase=0,                              # num test examples for all the base categories
    batch_size=1,
    num_workers=0,
    epoch_size=1 * opt.val_episode,           # num of batches per epoch
)

if opt.aws == 1:
    set_gpu(opt.gpu)

# check_dir('./experiments/')
check_dir(opt.save_path)
check_dir(opt.tensorboard_dir)

# debug the GPU part
# print("Device Count: ", torch.cuda.device_count())
# print("Dev 1: ", torch.cuda.get_device_name(0))
# print("Dev 2: ", torch.cuda.get_device_name(1))
# print("Dev 3: ", torch.cuda.get_device_name(2))
# print("Dev 4: ", torch.cuda.get_device_name(3))

log_file_path = os.path.join(opt.save_path, "train_log.txt")
print(log_file_path)
log(log_file_path, str(vars(opt)))
import utils
from utils import check_dir

fubar_cmd = "~sparks/hyphy-hyphyqt/HYPHYMP {0}"

argparser = argparse.ArgumentParser()
argparser.add_argument('--inroot', metavar='input_root', type=str, required=True)
argparser.add_argument('--logdir', metavar='log_dir', type=str, required=True)
args = argparser.parse_args()

inroot = args.inroot
logroot = args.logdir
utils.check_dir(logroot)

sizes = "Small", "Medium", "Big"
species_numbers = "6species", "12species", "17species", "44species"

# prepare each of the 12 directories with sequences for slr
for species in species_numbers:
    print species
    check_dir(path.join(inroot, species))
    check_dir(path.join(logroot, species))
    for size in sizes:
        print size
        check_dir(path.join(inroot, species, size))
        check_dir(path.join(logroot, species, size))
argparser.add_argument("--outdir", metavar="Output Dir + working dir + logdir", type=str, required=True) argparser.add_argument("--gpf", metavar="template Dir", type=str, required=True) args = argparser.parse_args() treedir = args.treedir outdir = args.outdir gpf = args.gpf ####### Prep ########### sizes = "Small", "Medium", "Big" species_numbers = "6species", "12species", "17species", "44species" check_dir(outdir) os.chdir(outdir) # extract information out of the gpf (gideon pomeranz file) parameters = open(gpf).read() m = re.search("(?<=n_sites=)\w+", parameters) n_sites = m.group(0) n_sites = int(n_sites) m = re.search("(?<=n_runs=)\w+", parameters) n_runs = m.group(0) n_runs = int(n_runs) m = re.search("(?<=alphas=).+", parameters) alphas = m.group(0) alphas = alphas.split(",")
inputRedirect["04"]="%s"; inputRedirect["05"]="20"; inputRedirect["06"]="5"; inputRedirect["07"]="2000000"; inputRedirect["08"]="1000000"; inputRedirect["09"]="100"; inputRedirect["10"]="0.5"; ExecuteAFile ("/nfs/research2/goldman/gregs/HBL/FUBAR/FUBAR.bf", inputRedirect); """ argparser = argparse.ArgumentParser() argparser.add_argument('--indir', metavar='input_directory', type=str, required=True) argparser.add_argument('--outdir', metavar='input_directory', type=str, required=True) argparser.add_argument('--clade', metavar='input_directory', type=str, required=True) args = argparser.parse_args() utils.check_dir(args.outdir) utils.check_dir(path.join(args.outdir, args.clade)) def read_slr(fh): stats = fh.readline() seqs = [] for l in utils.grouper(fh, 2): name = l[0].rstrip() seq = l[1].rstrip() seqs.append(SeqRecord(id=name, seq=Seq(seq), description="")) return seqs for f in glob.glob(path.join(args.indir, args.clade, '*', '*_slr.paml')):
def __set_cfg(self, cfgdir):
    '''Sets the config files and checks the directory'''
    cfgdir = os.path.join(cfgdir, self.name)
    utils.check_dir(cfgdir)
    self.sample_cfg = os.path.join(cfgdir, '{0.name}.cfg'.format(self))
    self.report_yaml = os.path.join(cfgdir, '{0.name}.build_reporter.yaml'.format(self))
def test_lstm(**kwargs):
    """
    Wrapper function for training and testing LSTM

    :type fold: int
    :param fold: fold index of the ATIS dataset, from 0 to 4.
    :type lr: float
    :param lr: learning rate used (factor for the stochastic gradient).
    :type nepochs: int
    :param nepochs: maximal number of epochs to run the optimizer.
    :type win: int
    :param win: number of words in the context window.
    :type nhidden: int
    :param nhidden: number of hidden units.
    :type emb_dimension: int
    :param emb_dimension: dimension of word embedding.
    :type verbose: boolean
    :param verbose: to print out epoch summary or not to.
    :type decay: boolean
    :param decay: decay on the learning rate if improvement stops.
    :type savemodel: boolean
    :param savemodel: save the trained model or not.
    :type normal: boolean
    :param normal: normalize word embeddings after each update or not.
    :type folder: string
    :param folder: path to the folder where results will be stored.
    """
    # process input arguments
    param = {
        'experiment': 'standard',
        'lr': 0.1,
        'verbose': True,
        'decay': True,
        'win': 3,
        'nhidden': 300,
        'nhidden2': 300,
        'seed': 345,
        'emb_dimension': 90,
        'nepochs': 40,
        'savemodel': False,
        'normal': True,
        'layer_norm': False,
        'minibatch_size': 4978,
        'folder': '../result'
    }
    param_diff = set(kwargs.keys()) - set(param.keys())
    if param_diff:
        raise KeyError("invalid arguments:" + str(tuple(param_diff)))
    param.update(kwargs)

    if param['verbose']:
        for k, v in param.items():
            print("%s: %s" % (k, v))

    # create result folder if it does not exist
    check_dir(param['folder'])

    # load the dataset
    print('... loading the dataset')
    train_set, valid_set, test_set, dic = load_data(3)
    train_set = list(train_set)
    valid_set = list(valid_set)

    # Add validation set to train set
    for i in range(3):
        train_set[i] += valid_set[i]

    # create mapping from index to label, and index to word
    idx2label = dict((k, v) for v, k in dic['labels2idx'].items())
    idx2word = dict((k, v) for v, k in dic['words2idx'].items())

    # unpack dataset
    train_lex, train_ne, train_y = train_set
    test_lex, test_ne, test_y = test_set

    n_trainbatches = len(train_lex) // param['minibatch_size']

    print("Sentences in train: %d, Words in train: %d" % (count_of_words_and_sentences(train_lex)))
    print("Sentences in test: %d, Words in test: %d" % (count_of_words_and_sentences(test_lex)))

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    groundtruth_test = [[idx2label[x] for x in y] for y in test_y]
    words_test = [[idx2word[x] for x in w] for w in test_lex]

    # instantiate the model
    numpy.random.seed(param['seed'])
    random.seed(param['seed'])
    print('... building the model')
    lstm = LSTM(n_hidden=param['nhidden'],
                n_hidden2=param['nhidden2'],
                n_out=nclasses,
                n_emb=vocsize,
                dim_emb=param['emb_dimension'],
                cwind_size=param['win'],
                normal=param['normal'],
                layer_norm=param['layer_norm'],
                experiment=param['experiment'])

    # train with early stopping on validation set
    print('... training')
    best_f1 = -numpy.inf
    param['clr'] = param['lr']
    for e in range(param['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], param['seed'])
        param['ce'] = e
        tic = timeit.default_timer()

        for minibatch_index in range(n_trainbatches):
            for i in range(minibatch_index * param['minibatch_size'],
                           (1 + minibatch_index) * param['minibatch_size']):
                x = train_lex[i]
                y = train_y[i]
                res = lstm.train(x, y, param['win'], param['clr'])

            predictions_test = [[
                idx2label[x] for x in lstm.classify(
                    numpy.asarray(contextwin(x, param['win'])).astype('int32'))
            ] for x in test_lex]

            # evaluation // compute the accuracy using conlleval.pl
            res_test = conlleval(predictions_test, groundtruth_test, words_test,
                                 param['folder'] + '/current.test.txt', param['folder'])

            if res_test['f1'] > best_f1:
                if param['savemodel']:
                    lstm.save(param['folder'])
                best_lstm = copy.deepcopy(lstm)
                best_f1 = res_test['f1']
                if param['verbose']:
                    print('NEW BEST: epoch %d, minibatch %d/%d, best test F1: %.3f'
                          % (e, minibatch_index + 1, n_trainbatches, res_test['f1']))
                param['tf1'] = res_test['f1']
                param['tp'] = res_test['p']
                param['tr'] = res_test['r']
                param['be'] = e
                os.rename(param['folder'] + '/current.test.txt',
                          param['folder'] + '/best.test.txt')
            else:
                if param['verbose']:
                    print('')

        # learning rate decay if no improvement in 10 epochs
        if param['decay'] and abs(param['be'] - param['ce']) >= 10:
            param['clr'] *= 0.5
            print("Decay happened. New Learning Rate:", param['clr'])
            lstm = best_lstm

        if param['clr'] < 0.00001:
            break

    print('BEST RESULT: epoch', param['be'], 'best test F1', param['tf1'],
          'with the model', param['folder'])

    return lstm, dic
def __init__(self, queue, step, pid, voc_size, valid_data_flow):
    self.queue = queue
    self.valid_data_flow = valid_data_flow
    threading.Thread.__init__(self)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    self.train_in_seq = tf.placeholder(tf.int32, shape=[dataflow.batch_size, None], name='in_seq')
    self.train_in_seq_len = tf.placeholder(tf.int32, shape=[dataflow.batch_size], name='in_seq_len')
    self.train_target_seq = tf.placeholder(tf.int32, shape=[dataflow.batch_size, None], name='target_seq')
    self.train_target_seq_len = tf.placeholder(tf.int32, shape=[dataflow.batch_size], name='target_seq_len')
    self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')

    self.model = model.Seq2Seq()
    self.model.build(self.train_in_seq, self.train_in_seq_len,
                     self.train_target_seq, self.train_target_seq_len,
                     voc_size, dataflow.hidden_unit, dataflow.layers,
                     dataflow.dropout, dataflow.learning_rate, name_scope='train')
    self.model.build_infer(self.train_in_seq, self.train_in_seq_len,
                           voc_size, dataflow.hidden_unit, dataflow.layers,
                           name_scope='infer')
    self.transfer = model.transfer_params(from_scope='train', to_sope='infer')

    self.sess.run(tf.global_variables_initializer())
    self.saver = Saver(self.sess)
    if start_step == 1:
        continue_train = False
    else:
        continue_train = True
    self.saver.auto_save_init(save_dir=dataflow.lstm_save_dir,
                              save_interval=saveTime,
                              max_keep=5,
                              scope_name='train',
                              continue_train=continue_train)
    self.saver.load(dataflow.init_path, scope_name='train', del_scope=True)

    print('Training Begin')
    self.step = step
    print('pid:{}'.format(pid))
    self.pid = pid

    if start_step == 1:
        if check_dir(dataflow.lstm_log_dir):
            del_dir_under(dataflow.lstm_log_dir)
        else:
            create_dir(dataflow.lstm_log_dir)
    else:
        assert check_dir(dataflow.lstm_log_dir)
    self.writer = tf.summary.FileWriter(dataflow.lstm_log_dir, self.sess.graph)
from draw import draw_all
from utils import check_dir
from utils import clean_temp_files
from utils import copy_resources_to_processed
from get_data_from_log_file import get_data_from_log_file

if __name__ == '__main__':
    model_name = 'densenet121-train90-lr0.1-batch768'
    # model_name = 'restnet18-train90-lr0.1'
    check_dir()
    get_data_from_log_file(log_file='%s.txt' % model_name)
    draw_all()
    copy_resources_to_processed(model_name=model_name, file_name='modified', replace=True)
    copy_resources_to_processed(model_name=model_name, file_name='plt', replace=True)
    # if you want to clean temp files under postprocess, use this
    clean_temp_files()
    print("All Done!")
    print("you can check resources in '../processed'")
    print("path in project is 'data/postprocess'")
argparser.add_argument("--alignroot", metavar="Alignment Dir", type=str, required=True) argparser.add_argument("--treeroot", metavar="Tree Dir", type=str, required=True) argparser.add_argument("--outroot", metavar="Output Dir + working dir + logdir", type=str, required=True) args = argparser.parse_args() alignroot = args.alignroot treeroot = args.treeroot outroot = args.outroot # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Create log directory and out directory check_dir(outroot) out_pre_dir = path.join(outroot, "out") check_dir(out_pre_dir) logdir = path.join(outroot, "logs") check_dir(logdir) if os.getcwd() != outroot: os.chdir(outroot) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # define the paml command codeml_cmd = "python /nfs/research2/goldman/pomeranz/tree_stats/Scripts/analyse_codeml.py --alignfile {0} --treefile {1} --template_dir {2} --outfile {3} --workdir {4}" # start the loop
    parser.add_argument('--outer_lr', type=float, default=1e-3)
    parser.add_argument('--outer_opt', type=str, default='Adam')
    parser.add_argument('--lr_sched', type=lambda x: (str(x).lower() == 'true'), default=False)

    # network settings
    parser.add_argument('--net', type=str, default='ConvNet')
    parser.add_argument('--n_conv', type=int, default=4)
    parser.add_argument('--n_dense', type=int, default=0)
    parser.add_argument('--hidden_dim', type=int, default=64)
    parser.add_argument('--in_channels', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=64,
                        help='Number of channels for each convolutional layer (default: 64).')

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    set_seed(args.seed)
    set_gpu(args.device)
    check_dir(args)
    set_logger(os.path.join('logs/', args.exp + '.txt'), log_console=False)

    t1_start = process_time()
    main(args)
    t1_stop = process_time()
    logging.info('Elapsed time = {}'.format(t1_stop - t1_start))
ens_cdna_dir = args.cds
for f in glob(path.join(ens_cdna_dir, '*.fa')):
    print "Processing", f
    for seqr in SeqIO.parse(f, 'fasta'):
        if seqr.id in ens_map:
            print "Duplicate id", seqr.id
            sys.exit(-1)
        ens_map[t2p[seqr.id]] = seqr.seq

clades_pickle = args.species_cache
clades = pickle.load(open(clades_pickle))

inroot = args.inroot
outroot = args.outroot
utils.check_dir(outroot)

for seqset in glob(path.join(inroot, args.clade, "*", "*.tab")):
    setid = path.basename(seqset).rpartition('.')[0]
    seqs = []
    utils.check_dir(path.join(outroot, args.clade))
    for l in open(seqset):
        seqid, species = l.rstrip().split('\t')
        # if species not in all_species:
        #     continue
        # TODO The completely honest thing to do would be to check if the genes
        # with missing sequences fall into the relevant clade or not
        seq = ens_map.get(seqid)
        if seq is None:
        data[item] = 0
    data.loc[:, [comConfig.col_has_link]] = data[comConfig.col_content].map(lambda x: whe_link(x))
    data.loc[:, [comConfig.col_has_title]] = data[comConfig.col_content].map(lambda x: whe_title(x))
    data.loc[:, [comConfig.col_has_emoj]] = data[comConfig.col_content].map(lambda x: whe_emoji(x))
    data.loc[:, [comConfig.col_has_at]] = data[comConfig.col_content].map(lambda x: whe_art(x))
    data.loc[:, [comConfig.col_text_len]] = data[comConfig.col_content].map(lambda x: get_Length(x))
    return data


if __name__ == '__main__':
    fe = feature_extraction()
    train_data = fe.read_train_data(fileConfig.csv_dir + fileConfig.file_train_pandas)
    # predict_data = fe.read_predict_data(fileConfig.data_dir + fileConfig.file_weibo_predict_data)
    test_data = fe.read_test_data(fileConfig.csv_dir + fileConfig.file_test_pandas)
    user_dict = fe.get_Dict(train_data)
    mblog_dict = utils.pickle_load(fileConfig.pickle_dir + fileConfig.file_train_mblog_dict_pkl)
    # print(dict)
    print("start creating train features...")
    train_data_updated = fe.build_feature(train_data, user_dict, mblog_dict)
    print("start creating test features...")
    # predict_data_updated = fe.build_feature(predict_data, dict)
    test_data_updated = fe.build_feature(test_data, user_dict, mblog_dict)
    # dict_dataframe = pd.DataFrame(dict).T
    # dict_dataframe.columns = ['total_count', 'total_reposts', 'total_comments', 'total_likes']
    # dict_dataframe.to_csv('dict_pandas.csv')
    utils.check_dir(fileConfig.csv_dir)
    train_data_updated.to_csv(fileConfig.csv_dir + fileConfig.file_fe_train)
    # predict_data_updated.to_csv(fileConfig.csv_dir + fileConfig.file_fe_predict)
    test_data_updated.to_csv(fileConfig.csv_dir + fileConfig.file_fe_test)
argparser.add_argument('--mode', metavar='mode', type=str, required=False, default='codon')
argparser.add_argument('--rerun', action='store_true')
args = argparser.parse_args()

if args.treeroot:
    prank_cmd = "prank -d={0} -t={1} -o={2} -prunetree -" + args.mode
else:
    prank_cmd = "prank -d={0} -o={1} -prunetree -" + args.mode

inroot = args.inroot
treeroot = args.treeroot
alndir = args.outroot
logroot = args.logdir

utils.check_dir(logroot)
utils.check_dir(path.join(logroot, args.clade))
utils.check_dir(alndir)
utils.check_dir(path.join(alndir, args.clade))

for infile in glob.glob(path.join(inroot, args.clade, "*", "*.fa")):
    print infile
    basename = path.basename(infile).partition('.')[0]
    prefix = basename.partition('_')[0][:2]
    outdir = path.join(alndir, args.clade, prefix)
    utils.check_dir(outdir)
    outfile = path.join(outdir, basename + '_prank')
    logdir = path.join(logroot, args.clade, prefix)
def format_trees(treeroot, fastaroot, outroot):
    fastafiles = path.join(fastaroot, "*/*.fa")
    if not os.path.exists(outroot):
        os.makedirs(outroot)
    rooted_out_dir = path.join(outroot, "rooted")
    check_dir(rooted_out_dir)
    unrooted_out_dir = path.join(outroot, "unrooted")
    check_dir(unrooted_out_dir)

    for infile in glob(fastafiles):
        print infile
        basename = path.basename(infile).partition('.')[0]
        basename = "".join(basename.split("_")[0] + "_" + basename.split("_")[1])
        prefix = basename.partition('_')[0][:2]
        fastafile = infile
        treedir = path.join(treeroot, prefix)
        treefile = path.join(treedir, basename + '.nh')

        # make the tree object
        tree = Tree(newick=treefile)

        # loop that deletes nodes that are not in the alignment
        for leaf_name in tree.iter_leaf_names():
            name_check = []
            for ID in SeqIO.parse(fastafile, "fasta"):
                if ID.id in leaf_name:
                    name_check.append(True)
                else:
                    name_check.append(False)
            if any(name_check):
                continue
            else:
                leaf = tree.search_nodes(name=leaf_name)[0]
                leaf.delete()
                # node = leaf.up
                # node.remove_child(leaf)

        # create the directories for rooted trees
        rooted_out_sub_dir = path.join(rooted_out_dir, prefix)
        check_dir(rooted_out_sub_dir)
        rooted_out_file = path.join(rooted_out_sub_dir, basename + ".nh")
        tree.write(outfile=rooted_out_file, format=6)

        # create subdirectories for unrooted trees
        unrooted_out_sub_dir = path.join(unrooted_out_dir, prefix)
        check_dir(unrooted_out_sub_dir)
        unrooted_out_file = path.join(unrooted_out_sub_dir, basename + ".nh")

        # unroot the tree
        tree.unroot()
        tree.write(outfile=unrooted_out_file, format=6)
            output_results.append(output_text)
            if _ % 100 == 0 and i == 0:
                print('====================')
                input_text = decode_text(in_seq[i], self.valid_data_flow.vocabs)
                print('src: ' + input_text)
                print('output: ' + ' '.join(output_text))
                print('target: ' + ' '.join(target_text))
        return bleu.compute_bleu(target_results, output_results)[0] * 100


if __name__ == '__main__':
    pid = os.getpid()
    if not check_dir(dataflow.lstm_save_dir):
        create_dir(dataflow.lstm_save_dir)

    print('loading training data...')
    train_data_flow = dataflow.DataFlow(dataflow.batch_size,
                                        data_dir=dataflow.data_path + 'train/')
    print('loading evaluation data...')
    valid_data_flow = dataflow.DataFlow(dataflow.batch_size,
                                        data_dir=dataflow.data_path + 'test/',
                                        shuffle=True)

    q = queue.Queue(maxsize=100)
    pt = Producter(queue=q, data_loader=train_data_flow, step=maxstep)
    ce = Consumer(step=maxstep, queue=q,
def print_file(arg_namespace):
    _arg = arg_namespace
    printer = print_json
    encoding = _arg.encoding
    target = _arg.target
    dataType = _arg.dataType
    start = _arg.start
    end = _arg.end if _arg.end else start
    filename_time = _arg.filename_time
    filetype = 'json'
    interval_time = _arg.interval_time

    if target == 'local':
        localType = _arg.localType
    else:
        localType = None

    if _arg.directory:
        datadir = _arg.directory
    else:
        if target == 'local' and localType:
            datadir = './crawled_data/%s-%s/%s' % (target, localType, dataType)
        else:
            datadir = './crawled_data/%s/%s' % (target, dataType)

    time_string = datetime.today().strftime("%Y%m%d%H%M%S")
    check_dir(datadir)

    jobs = []
    if target == 'local':
        if filename_time:
            for n in range(start, end + 1):
                filename = '%s/%s-%s-%s-%d-%s.%s' \
                    % (datadir, target, localType, dataType, n, time_string, filetype)
                job = gevent.spawn(crawl, target=target, localType=localType,
                                   _dataType=dataType, nth=n, filename=filename,
                                   encoding=encoding, printer=printer)
                jobs.append(job)
        else:
            for n in range(start, end + 1):
                filename = '%s/%s-%s-%s-%d.%s' \
                    % (datadir, target, localType, dataType, n, filetype)
                job = gevent.spawn(crawl, target=target, localType=localType,
                                   _dataType=dataType, nth=n, filename=filename,
                                   encoding=encoding, printer=printer)
                jobs.append(job)
    else:
        if filename_time:
            for n in range(start, end + 1):
                filename = '%s/%s-%s-%d-%s.%s' \
                    % (datadir, target, dataType, n, time_string, filetype)
                job = gevent.spawn(crawl, target=target, _dataType=dataType, nth=n,
                                   filename=filename, encoding=encoding, printer=printer)
                jobs.append(job)
        else:
            for n in range(start, end + 1):
                filename = '%s/%s-%s-%d.%s' \
                    % (datadir, target, dataType, n, filetype)
                job = gevent.spawn(crawl, target=target, _dataType=dataType, nth=n,
                                   filename=filename, encoding=encoding, printer=printer)
                jobs.append(job)

    gevent.joinall(jobs)
    print('Data written to %s' % filename)

    if interval_time != 0 and interval_time is not None:
        s = sched.scheduler(time.time, time.sleep)
        print('The program will crawl the next data within %d seconds.' % interval_time)
        s.enter(interval_time, 1, print_file, kwargs=dict(arg_namespace=_arg))
        s.run()