def uniq_pos(f1, f2):
    nn = 0
    nn1 = 0
    nn2 = 0
    outff = open(f2, "w")
    pos10 = ""
    pos20 = ""
    for line in fileinput.input(f1):
        nn += 1
        l = line.split()
        pos = l[0]
        sign = pos[4]
        if sign == "+":
            if pos != pos10:
                nn1 += 1
                outff.write("%s" % (line))
                pos10 = pos
        elif sign == "-":
            if pos != pos20:
                nn2 += 1
                outff.write("%s" % (line))
                pos20 = pos
    fileinput.close()
    outff.close()
    print "-- sorted       : %d" % (nn)
    print "-- uniq+sorted  : %d" % (nn1)
    print "-- uniq-sorted  : %d" % (nn2)
    print "-- uniq_sorted  : %d" % (nn1 + nn2)
def extractSrcFileData(self, path):
    fileinput.close()
    isLocListener = False
    wakeLockAcqRegex = "invoke-virtual(.*?)Landroid/os/PowerManager$WakeLock;->acquire()"
    domRegex = "invoke-virtual(.*?)Ljavax/xml/parsers/DocumentBuilderFactory;->newDocumentBuilder()"
    saxRegex = "invoke-virtual(.*?)Ljavax/xml/parsers/SAXParserFactory;->newSAXParser()"
    xmlppRegex = "invoke-static(.*?)Landroid/util/Xml;->newPullParser()"
    for line in fileinput.input([path]):
        matches = re.findall(wakeLockAcqRegex, line)
        if len(matches) > 0:
            self.numNoTimeoutWakeLocks = self.numNoTimeoutWakeLocks + 1
        if line.startswith(".implements Landroid/location/LocationListener;"):
            self.numLocListeners = self.numLocListeners + 1
            isLocListener = True
        if isLocListener:
            if "\"gps\"" in line:
                self.numGpsUses = self.numGpsUses + 1
        matches = re.findall(domRegex, line)
        if len(matches) > 0:
            self.numDomParser = self.numDomParser + 1
        matches = re.findall(saxRegex, line)
        if len(matches) > 0:
            self.numSaxParser = self.numSaxParser + 1
        matches = re.findall(xmlppRegex, line)
        if len(matches) > 0:
            self.numXMLPullParser = self.numXMLPullParser + 1
def parse(self, file_location):
    """Loads Kv data into memory

    Args:
        file_location (str): Path of file

    Returns:
        dict: name : numpy array of events
    """
    # This loop both sets the first_line variable and finds the number of lines in the file
    for file_length, line in enumerate(fileinput.input(file_location)):
        if file_length == 0:
            first_line = line
    fileinput.close()
    parsed = {}
    for x in range(len(first_line.split(","))):
        parsed[first_line.split(",")[x].split("=")[0]] = numpy.zeros(shape=file_length + 1, dtype="float64")
    for index, line in enumerate(fileinput.input(file_location)):
        the_line = line.strip("\n")
        for particle_count in range(len(line.split(","))):
            parsed[the_line.split(",")[particle_count].split("=")[0]][index] = \
                numpy.float64(the_line.split(",")[particle_count].split("=")[1])
    fileinput.close()
    return parsed
def processDir(dir_proc):
    for file in os.listdir(dir_proc):
        if os.path.isdir(os.path.join(dir_proc, file)):
            print "WARN:%s is a directory" % (file)
            processDir(os.path.join(dir_proc, file))
            continue
        if not file.endswith(".log"):
            print "WARN:%s is not a log file" % (file)
            continue
        print "INFO:process file %s" % (file)
        for line in fileinput.input(os.path.join(dir_proc, file)):
            matchs = nginxLogPattern.match(line)
            if matchs != None:
                allGroups = matchs.groups()
                ip = allGroups[0]
                time = allGroups[1]
                request = allGroups[2]
                status = allGroups[3]
                bodyBytesSent = allGroups[4]
                refer = allGroups[5]
                # userAgent = allGroups[6]
                userAgent = matchs.group("userAgent")
                print userAgent
                # count the number of occurrences of each HTTP status code
                GetResponseStatusCount(userAgent)
                # add any other analysis code needed here
            else:
                raise Exception
        fileinput.close()
def configuring_nodejs_app(self, git_repo, random_string):
    try:
        file_name = git_repo + "/server.js"
        for line in fileinput.input(file_name, inplace=True):
            match = re.search(r"res.send\(self.cache_get\('index.html.*", line)
            if match:
                print 'res.send("<html><head></head><body><p>%s</p></body></html>");' % (random_string)
            else:
                print line,
    except Exception as e:
        fileinput.close()
        print type(e)
        print e.args
        return 1
    finally:
        fileinput.close()
    deployment_steps = [
        "cd %s" % (git_repo),
        "git commit -a -m 'Added special handler for /'",
        "git push"
    ]
    (ret_code, ret_output) = common.command_getstatusoutput(" && ".join(deployment_steps))
    print ret_output
    return ret_code
def _read_multi_column_list(self, list_file):
    rows = []
    if not os.path.isfile(list_file):
        raise RuntimeError('File %s does not exist.' % (list_file,))
    try:
        for line in fileinput.input(list_file):
            if line.strip().startswith('#'):
                continue
            parsed_line = re.findall('[\w/(-.)]+', line)
            if len(parsed_line):
                # perform some sanity checks
                if len(parsed_line) not in (2, 3, 4):
                    raise IOError("The read line '%s' from file '%s' could not be parsed successfully!" % (line.rstrip(), list_file))
                if len(rows) and len(rows[0]) != len(parsed_line):
                    raise IOError("The parsed line '%s' from file '%s' has a different number of elements than the first parsed line '%s'!" % (parsed_line, list_file, rows[0]))
                # append the read line
                rows.append(parsed_line)
        fileinput.close()
    except IOError as e:
        raise RuntimeError("Error reading the file '%s' : '%s'." % (list_file, e))
    # return the read list as a vector of columns
    return rows
def test_sort_big_file_numeric(self):
    join_fields = '0'
    sorter = mod.CSVSorter(self.dialect, join_fields, self.temp_dir, self.temp_dir)
    outfile = sorter.sort_file(self.fqfn)
    assert outfile == self.fqfn + '.sorted'
    for rec in fileinput.input(self.fqfn + '.sorted'):
        fields = rec.split(',')
        print(fields)
        if fileinput.lineno() == 1:
            assert fields[0] == '1'
        elif fileinput.lineno() == 2:
            assert fields[0] == '2'
        elif fileinput.lineno() == 3:
            assert fields[0] == '3'
        elif fileinput.lineno() == 4:
            assert fields[0] == '4'
        elif fileinput.lineno() == 5:
            assert fields[0] == '5'
        elif fileinput.lineno() == 6:
            assert fields[0] == '6'
        elif fileinput.lineno() == 7:
            assert fields[0] == '7'
        elif fileinput.lineno() == 8:
            assert fields[0] == '8'
        elif fileinput.lineno() == 9:
            assert fields[0] == '9'
        elif fileinput.lineno() == 10:
            assert fields[0] == '10'
        else:
            assert 0, 'too many rows returned'
    fileinput.close()
def remove_line(line_search, filepath):
    for line in fileinput.input(filepath, inplace=True):
        if line_search in line.strip():
            continue
        else:
            print(line.rstrip("\n"))
    fileinput.close()
def test_full_single_file(self):
    """ Tests use of columns all against a single file.
    """
    cmd = "%s %s -c '1,2' -d '|' -o %s " % (os.path.join(script_path, 'gristle_freaker'), self.easy_fqfn, self.out_fqfn)
    p = subprocess.Popen(cmd,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         close_fds=True,
                         shell=True)
    records = p.communicate()[0]
    assert p.returncode == 0
    #for record in records.split('\n'):
    #    print record
    out_rec_cnt = 0
    for rec in fileinput.input(self.out_fqfn):
        out_rec_cnt += 1
        fields = rec[:-1].split('-')
        key_col_1 = fields[0].strip()
        key_col_2 = fields[1].strip()
        freq_cnt = int(fields[2])
        assert key_col_1 == 'A'
        assert key_col_2 == 'B'
        assert freq_cnt == 100
    fileinput.close()
    assert out_rec_cnt == 1
    p.stdin.close()
def get(index):
    for line in fileinput.input(STACK_FILE):
        if fileinput.lineno() == index:
            fileinput.close()
            return line
    fileinput.close()
    raise RuntimeError("The index selected does not exist.")
def swap_lines(line_search_1, line_search_2, filepath):
    """
    Swap lines in file, if line_search1 before line_search2
    :param filepath:
    :param line_search_2:
    :param line_search_1:
    """
    count = 0
    count1 = 0
    count2 = 0
    for line in fileinput.input(filepath):
        count += 1
        if line_search_1.strip() in line.strip():
            count1 = count
        elif line_search_2.strip() in line.strip():
            count2 = count
    fileinput.close()
    if 0 < count1 < count2:
        for line in fileinput.input(filepath, inplace=True):
            if line_search_1.strip() in line.strip():
                print(line_search_2)
            elif line_search_2.strip() in line.strip():
                print(line_search_1)
            else:
                print(line.rstrip("\n"))
        fileinput.close()
def upload(filename, dbname, session):
    blockcounter = 0
    rowcounter = 0
    requestdata = dict(new_edits=False, docs=[])
    for line in fileinput.FileInput(filename):
        try:
            line = line.rstrip()
            if line[-1] == ',':
                line = line[:-1]
            bloated_doc = json.loads(line)
            if blockcounter >= config['blocksize']:
                # update db
                updatedb(dbname, requestdata, session)
                # reset the temp dict and counter
                requestdata = dict(new_edits=False, docs=[])
                blockcounter = 0
            # add row to temp dict
            requestdata['docs'].append(bloated_doc['doc'])
            # increment the row counter
            blockcounter += 1
        except:
            if rowcounter != 0 and line != ']}':
                print 'An exception occured on line {0}'.format(rowcounter)
        finally:
            rowcounter += 1
    fileinput.close()
    # write any remaining rows to the database
    updatedb(dbname, requestdata, session)
    print 'Database "{0}" uploading completed.'.format(dbname)
def load_dbpedia(data, database_1, database_2):
    for line in fileinput.input(data):
        e1, rel, e2, p = line.split()
        e1 = e1.split('<http://dbpedia.org/resource/')[1].replace(">", "")
        e2 = e2.split('<http://dbpedia.org/resource/')[1].replace(">", "")
        e1 = re.sub("_", " ", e1)
        e2 = re.sub("_", " ", e2)
        if "(" in e1 or "(" in e2:
            e1 = re.sub("\(.*\)", "", e1)
            e2 = re.sub("\(.*\)", "", e2)
            # store a tuple (entity1, entity2) in a dictionary
            database_1[(e1.strip(), e2.strip())].append(p)
            # store in a dictionary per relationship: dict['ent1'] = 'ent2'
            database_2[e1.strip()].append(e2.strip())
        else:
            e1 = e1.decode("utf8").strip()
            e2 = e2.decode("utf8").strip()
            # store a tuple (entity1, entity2) in a dictionary
            database_1[(e1, e2)].append(p)
            # store in a dictionary per relationship: dict['ent1'] = 'ent2'
            database_2[e1.strip()].append(e2.strip())
    fileinput.close()
    return database_1, database_2
def realbacktest(ticker="AAPL", start="2014-01-01", end="2015-10-23", duration=20, commission=2, file="test3"):
    global initialPrice
    try:
        startDate = datetime.strptime(start, "%Y-%m-%d")
        endDate = datetime.strptime(end, "%Y-%m-%d")
    except:
        print("wrong date format! Expect: %Y-%m-%d")
        return
    if (endDate - startDate).days < duration:
        print("duration larger than Duration")
        return
    stock = yahoo_finance.Share(ticker)
    stockHistory = stock.get_historical(start, end)
    stockHistory = stockHistory[::-1]
    realGain = getGains(stockHistory, duration, commission)
    fd = open(file, "wb")
    fd.write(str(realGain) + " net return, moving average\n")
    fd.write(
        str(
            (float(stockHistory[-1]["Adj_Close"]) - float(stockHistory[0]["Adj_Close"]))
            / float(stockHistory[0]["Adj_Close"])
        ) + " buy and hold return\n"
    )
    fd.close()
def replaceInplace(f, searchExp, replaceExp):
    import fileinput
    for line in fileinput.input(f, inplace=1):
        if searchExp in line:
            line = line.replace(searchExp, replaceExp)
        sys.stdout.write(line)
    fileinput.close()  # reported by jakob
def extractSrcFileData(self, path):
    fileinput.close()
    for line in fileinput.input([path]):
        matches = re.findall("invoke-virtual (.*?), Landroid/(.*?);->dismiss\(", line)
        if len(matches) > 0:
            self.dismiss = self.dismiss + 1
        matches = re.findall("invoke-virtual (.*?), Landroid/(.*?);->show\(", line)
        if len(matches) > 0:
            self.show = self.show + 1
        matches = re.findall("invoke-virtual (.*?), (.*?);->setContentView\(", line)
        if len(matches) > 0:
            self.setContentView = self.setContentView + 1
        matches = re.findall("invoke-virtual (.*?), Landroid/(.*?);->createScaledBitmap\(", line)
        if len(matches) > 0:
            self.createScaledBitmap = self.createScaledBitmap + 1
        matches = re.findall("invoke-virtual (.*?), (.*?);->onKeyDown\(", line)
        if len(matches) > 0:
            self.onKeyDown = self.onKeyDown + 1
        matches = re.findall("invoke-virtual (.*?), Landroid/(.*?);->isPlaying\(", line)
        if len(matches) > 0:
            self.isPlaying = self.isPlaying + 1
        matches = re.findall("invoke-virtual (.*?), (.*?);->unregisterReceiver\(", line)
        if len(matches) > 0:
            self.unregisterReceiver = self.unregisterReceiver + 1
        matches = re.findall("invoke-virtual (.*?), (.*?);->onBackPressed\(", line)
        if len(matches) > 0:
            self.onBackPressed = self.onBackPressed + 1
        matches = re.findall("invoke-virtual (.*?), (.*?);->showDialog\(", line)
        if len(matches) > 0:
            self.showDialog = self.showDialog + 1
        matches = re.findall("invoke-virtual (.*?), Landroid/(.*?);->create\(", line)
        if len(matches) > 0:
            self.create = self.create + 1
def extract_weibo(_index_begin, _index_end):
    _file = open("weibo_place_weibo_{0}_{1}_extracted.txt".format(_index_begin, _index_end), "w")
    _index, _curr_poid, _curr_poid_index, _curr_poid_count, _time_begin, _time_end = 0, "", 0, 0, "", ""
    for line in fileinput.input("weibo_place_weibo_{0}_{1}.txt".format(_index_begin, _index_end)):
        # _index += 1
        # print _index
        _poid = line.strip().split('\t')[0]
        _content = ' '.join(line.strip().split('\t')[2:])
        try:
            _uid = re.findall(r'<a.*?class="card_content" alt="(.+?)">', _content)[0].strip()
            _text = re.sub(r'<.+?>', '', re.findall(r'</a>:([\s\S]*?)</a>', _content)[0]).strip()
            _date = re.findall(r'<a.*?class="date">(.*?)</a>', line)[0].strip()
            _loc, _lng, _lat = re.findall(r'<a class="showmapbox" action-data="(.*?)\|(.*?),(.*?)\|0\|.*?">', _content)[0]
            _lng, _lat = round(float(_lng), 3), round(float(_lat), 3)
            _cord = "{0}|{1},{2}".format(_loc.strip(), _lng, _lat)
            _file.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(_poid, _date, _cord, _uid, _text))
            # validation
            if _poid != _curr_poid:
                if _curr_poid != "":
                    print _curr_poid_index, _curr_poid, _curr_poid_count, _time_begin, _time_end
                _curr_poid, _curr_poid_index, _curr_poid_count, _time_begin = _poid, _curr_poid_index + 1, 0, _date
            else:
                _time_end, _curr_poid_count = _date, _curr_poid_count + 1
        except:
            continue
    fileinput.close()
    _file.close()
def load_img_list(img_list_file):
    global dc
    gc.disable()
    count = 0
    for line in fileinput.input(img_list_file):
        img1_file, img2_file, ofx_file, ofy_file = line.strip().split(' ')
        #print "[",count,"] ", img1_file, img2_file, ofx_file, ofy_file
        if img1_file not in dc:
            #id, key = get_img_key(img1_file, False)
            imgs.append(img1_file)
            #keys.append(key)
            dc[img1_file] = 1
        if img2_file not in dc:
            #id, key = get_img_key(img2_file, False)
            imgs.append(img2_file)
            #keys.append(key)
            dc[img2_file] = 1
        count += 1
        if max_images > 0 and count > max_images:
            break
        #string_ = str(block_first + in_idx + 1) + ' / ' + str(len(img_list))
        if (count % 1000) == 0:
            sys.stdout.write("\r%d" % count)
            sys.stdout.flush()
    gc.enable()
    fileinput.close()
    print
    return
def __load_obj_list(self):
    """Read list of image files from XML metadata file.

    We assume the order of file names to correspond to the sequence of pages.

    - don't use self.__get_from_xml, because we want to scan lines sequentially here
    """
    self.__data['objects'] = {}
    tag_name = self.__cfg.get(self.__group, "obj_tag")
    # now scan the xml file
    idx = 0
    for line in fileinput.input(self.__xml_file):
        content = self.__extract_xml_content(line, tag_name)
        if content is None:
            continue
        idx += 1
        tmp = {}
        tmp['file name'] = os.path.abspath(os.path.join(self.__base_dir, content))
        # this 'index' defines the order of objects in the document
        tmp['index'] = idx
        # we must use imaginary oid's since we are reading from a file,
        # this OID defines the object ID in the data store, this
        # has nothing to do with the semantic order of objects
        self.__data['objects'][idx] = tmp
    # cleanup
    fileinput.close()
    if idx == 0:
        _log.warning("no files found for import")
        return None
    _log.debug("document data files to be processed: %s" % self.__data['objects'])
    return 1
def plot_chord_window_transfer():
    window_map = {}
    for line in fileinput.input("data_processed/chord_window_transfer.txt"):
        window_1, window_2, v = line.strip().split("\t")[0].split(",")[0], line.strip().split("\t")[0].split(",")[1], line.strip().split("\t")[1]
        if not window_map.has_key(window_1):
            window_map[window_1] = {"canteen": "", "total": 0, "stay": 0, "price": 0}
        if window_2 == window_1:
            window_map[window_1]["stay"] = int(v)
        window_map[window_1]["total"] += int(v)
    fileinput.close()
    for line in fileinput.input("data_processed/tree_window.txt"):
        (canteen, window, v1, v2, v3) = line.strip().split("\t")
        window_map[window]["canteen"] = canteen
        window_map[window]["price"] = float(v3)
    fileinput.close()
    with open("data2js.txt", "w") as f:
        for window, values in window_map.iteritems():
            if values["canteen"] == "一餐":
            # if values["canteen"] == "二餐":
            # if values["canteen"] == "三餐":
            # if values["canteen"] == "四餐":
            # if values["canteen"] == "五餐":
            # if values["canteen"] == "六餐":
            # if values["canteen"] == "哈乐":
                f.write("[" + str(round(values["price"], 2)) + "," + str(round(100.0 * values["stay"] / values["total"], 2)) + "],\n")
def calctrigramcount(inputfilename, trigramcountmap):
    global alphabets
    for inputline in fileinput.input(inputfilename):
        inputline = inputline.strip('\r\n')
        inputline = inputline.strip(' ')
        if len(inputline) == 0:
            continue
        start = 0
        currentletter = ''
        previousletter1 = ''
        previousletter2 = ''
        for character in inputline:
            if character == ' ':
                continue
            currentletter = character
            if start == 0:
                previousletter1 = 'S'
                previousletter2 = 'S'
                token = previousletter1 + previousletter2 + currentletter
                trigramcountmap[token] += 1.0
                previousletter1 = previousletter2
                previousletter2 = currentletter
                start = 1
                continue
            token = previousletter1 + previousletter2 + currentletter
            trigramcountmap[token] += 1.0
            previousletter1 = previousletter2
            previousletter2 = currentletter
        currentletter = 'E'
        token = previousletter1 + previousletter2 + currentletter
        trigramcountmap[token] += 1.0
    fileinput.close()
    return trigramcountmap
def parse_email(self):
    db = JsonDbJetty()
    rawfile = db.get_info('article', 'raw_file')
    bodyfile = db.get_info('article', 'body_file')
    shutil.copyfile(rawfile, bodyfile)
    cxt_flag = False
    regx = re.compile('^,,,,$')
    for l in fileinput.input(bodyfile, inplace=True):
        if cxt_flag == True:
            #print l
            sys.stdout.write(l)
        else:
            theline = l.strip()
            if regx.match(theline):
                cxt_flag = True
                continue
            if theline.startswith('class='):
                self.blogclass = theline.replace('class=', '')
                continue
            if theline.startswith('title='):
                self.blogtitle = theline.replace('title=', '')
                continue
    fileinput.close()
    if self.blogclass and self.blogtitle and cxt_flag == True:
        return 1
    else:
        return 0
def consist(infile, PathToTaxonomy):
    TargetCategories = {}
    outfile = open(infile + ".con", 'w')
    catdict = {}
    for line in fileinput.input([PathToTaxonomy + "/nodes.dmp"]):
        DictValues = line.split('\t')
        catdict[DictValues[0]] = DictValues[4]
    fileinput.close()
    for line in fileinput.input([infile]):
        HitValues = line.strip().split('\t')
        TargetCategories[HitValues[0]] = {'perc': HitValues[2], 'species': [], 'genus': [], 'family': [], 'order': [], 'phylum': [], 'class': [], 'kingdom': []}
    fileinput.close()
    for line in fileinput.input([infile]):
        HitValues = line.strip().split('\t')
        n = 12
        while n < len(HitValues):
            for cat in TargetCategories[HitValues[0]].keys():
                if catdict[HitValues[n]] == cat:
                    if HitValues[n] not in TargetCategories[HitValues[0]][cat]:
                        TargetCategories[HitValues[0]][cat].append(HitValues[n])
            n = n + 1
    print "Number of ids with taxonomic information:" + str(len(TargetCategories))
    for ID in TargetCategories.keys():
        LCA_per_ID = {}
        for cat in TargetCategories[ID].keys():
            if len(TargetCategories[ID][cat]) > 1:
                LCA_per_ID[cat] = str(','.join(TargetCategories[ID][cat]))
            elif len(TargetCategories[ID][cat]) == 0:
                LCA_per_ID[cat] = 'n' + str(len(TargetCategories[ID][cat]))
            else:
                LCA_per_ID[cat] = TargetCategories[ID][cat][0]
        outfile.write(ID + '\t' + TargetCategories[ID]['perc'] + '\t' + LCA_per_ID['species'] + '\t' + LCA_per_ID['genus'] + '\t' + LCA_per_ID['family'] + '\t' + LCA_per_ID['order'] + '\t' + LCA_per_ID['class'] + '\t' + LCA_per_ID['phylum'] + '\t' + LCA_per_ID['kingdom'] + '\n')
    outfile.close()
def readfilelist(self):
    file_list = self.file_list
    directory = self.directory
    table1 = self.table1
    db1 = self.db1
    cursor1 = self.cursor1
    ast_lit = ast.literal_eval
    open_file_object_list = [os.path.join(directory, filename) for filename in file_list]
    tuple_sku = ["'%s'" % (ast_lit(line)[0]) for line in fileinput.input(open_file_object_list)]
    fileinput.close()
    sql = """update %s set upload_image_status = "YES" where product_id in (%s)""" % (table1, ", ".join(tuple_sku))
    #print sql
    #try:
    cursor1.execute(sql)
    db1.commit()
    print "updated........................."
    #except:
    #    db1.rollback()
    del open_file_object_list[:]
    del open_file_object_list
    del tuple_sku[:]
    del tuple_sku
def _get_format_type(self) -> str:
    """ Determines format type based on whether or not all records
        are of the same length.

        Returns either 'csv' or 'fixed'
    """
    # our solution isn't accurate enough to show yet, so for now just
    # set to 'csv':
    return 'csv'

    #todo: make this smarter:
    #    - Since we're not using a csv dialect we could have control
    #      characters breaking a row into multiple lines.
    #    - Also, a small csv file might really have all rows get the same
    #      length.
    #    - Also, the user may have passed in explicit csv dialect info.
    rec_length = collections.defaultdict(int)
    for rec in fileinput.input(self.fqfn):
        rec_length[len(rec)] += 1
        if fileinput.lineno() > 1000:  # don't want to read millions of recs
            break
    fileinput.close()
    if len(rec_length) == 1:
        return 'fixed'
    else:
        return 'csv'
def parseFromPkg(self, pkgfile):
    ''' parse all bus record from pkg file '''
    beginTime = datetime.now()
    self.pkgfile = pkgfile
    self.pkgdir = os.path.dirname(self.pkgfile)
    if not os.path.isfile(self.pkgfile):
        self.faildExit("pkg file [%s] is not exists" % pkgfile)
    self._checkPkgFile()
    lineno = 1
    for line in fileinput.input(pkgfile):
        if lineno == 1:
            if not self._readPkgHead(line):  # parse the package-list file header
                self.faildExit("pkg head format error")
        elif lineno - 1 <= self.recordCount:
            if len(line) == 0:
                continue
            if not self._checkRecordFile(line):  # check whether the record file exists
                self.faildExit("record file[%s] not exists or size error" % line)
        else:
            break
        lineno = lineno + 1
    if len(self.recordFiles) != self.recordCount:
        self.faildExit("pkg file list not match real files")
    fileinput.close()
    self._makeStructDef()
    r, m = self._genSQL()  # generate the SQL file
    if r != 0:
        self.faildExit(m)
    self._finishRemoveFile(pkgfile)
    delta = datetime.now() - beginTime
    total = delta.seconds + delta.microseconds / 1000000.0
    print "Success : [%s] total[%d] time[%.2f]second" % (self.SQLFile, self.totalRecord, total)
def _checkMd5(self, tarfile):
    ''' verify the MD5 checksum '''
    md5file = self._getMD5FileName(tarfile)
    if not os.path.exists(md5file):
        print "md5 file not exists[%s]" % md5file
        return False
    m = None
    if sys.hexversion >= 0x02050000:
        import hashlib
        m = hashlib.md5()
    else:
        import md5
        m = md5.md5()
    tarhandle = open(tarfile, 'rb')
    while True:
        data = tarhandle.read(1024)
        if len(data) > 0:
            m.update(data)
        if len(data) < 1024:
            break
    tarhandle.close()
    digest = m.hexdigest().upper()
    md5fileDigest = ''
    for line in fileinput.input(md5file):
        md5fileDigest = line.upper().strip()
        break
    fileinput.close()
    if digest == md5fileDigest[:32]:
        return True
    print "digest[%s][%s]" % (digest, md5fileDigest)
    return False
def cleanup(self):
    """
    Called only if PackStack Setup is successful.
    Returns modified system environment to default.
    It performs the following activities:
    1. Enable Repositories, during installation default repos are disabled {for offline install}
       (NOTE: EPEL repo is not changed back to default, it still points to /root/cloud/)
    2. Remove rabbitmq entry from /etc/hosts file
    3. Disable initial-setup-text service
    """
    count = 3
    for line in fileinput.input(ROOT_PATH + "/etc/yum.repos.d/CentOS-Base.repo", inplace=True):
        if line.startswith("enabled"):
            if count > 0:
                count -= 1
                print(line.replace("enabled=0", "enabled=1").rstrip("\n"))
            else:
                print line,
        else:
            print line,
    fileinput.close()
    for line in fileinput.input(ROOT_PATH + '/etc/hosts', inplace=True):
        print(line.replace("127.0.0.1 www.rabbitmq.com", "").rstrip("\n"))
    fileinput.close()
    ret = iutil._run_systemctl("disable", "initial-setup-text")
    if ret:
        print ("Failed to Disable INITIAL-SETUP-UTILITY\n")
        return False
    os.remove(os.path.normcase(ROOT_PATH + "/var/www/html/rabbitmq-signing-key-public.asc"))
    return True
def gather():
    util.debug("Gathering result")
    job_results = results
    paths.ensure(job_results)
    #aazj00103_05.075_v1_3.bz2  zj001.fasta  zj001.psipred_ss2
    #aazj00109_05.075_v1_3.bz2  zj001.psipred
    aa_pattern = "aa" + name + "(03|09)\_05\.075\_v1\_3$"
    patterns = [aa_pattern]
    for suf in [".fasta", ".psipred", ".psipred_ss2"]:
        patterns.append(name + suf)
    try:
        for pattern in patterns:
            print "Pattern ", pattern
            match = paths.find(pattern, scr_job)
            if not match:
                raise Exception("Missing file")
            for path in match:
                if pattern == aa_pattern:
                    print "Filtering Columns %s" % path
                    for line in fileinput.input(path, inplace=1):
                        print " " + line[:47].strip()
                    fileinput.close()
                print "Path ", path
                file = paths.getFile(path)
                dest = paths.join(job_results, file)
                util.copy(path, job_results)
                if pattern == aa_pattern:
                    util.system("bzip2 %s" % dest)
    except:
        paths.removerf(job_results)
        raise
def main(args):
    ap = argparse.ArgumentParser()
    ap.add_argument('files', nargs='*', help='files to sort')
    ap.add_argument('-r', '--reverse', action='store_true', default=False,
                    help='reverse the result of comparisons')
    ns = ap.parse_args(args)

    def _print(lines):
        if lines is not None:
            lines = sorted(lines)
            if ns.reverse:
                lines = lines[::-1]
            print(''.join(lines))

    fileinput.close()  # in case it is not closed
    try:
        lines = None
        for line in fileinput.input(ns.files):
            if fileinput.isfirstline():
                _print(lines)
                lines = []
            lines.append(line)
        _print(lines)
    finally:
        fileinput.close()
def method_obfuscation(self): """ Method name obfuscation in invoke-families and fields ->function() ==> ->encrypt() Lclass; => Lencrypt; field: => encrypt_field: [invoke pattern handling] 1. check -> 2. change method name() 1) if method name belongs to classname_list : change 2) if method name belongs to API : not change 3. change refered field name() 1) skip R.class now - affect layout, ids, public, class's local "definition" 2) change others - iget-boolean v0, p0, Lcn/smstelphoneapp/service/STAService;->g:Z => ->enc:Z 4. change class name() * MEMO- A:class (field A from class) [method pattern handling] [field definition handling] [etc pattern handling] """ print "[*] Method name obfuscation" for index in range(len(self.smali_path)): full_path = ret_fullpath(self.target, self.smali_path[index],\ self.smali_filename[index]) only_filename = self.smali_filename[index] #logging.info(only_filename) for line in fileinput.input(full_path, inplace=1): #for line in open(full_path,'r').readlines(): changed = False "invoke_pattern change ->" # 1. check -> if ";->" in line: left_class_name = ret_class_from_method_call(line, first=True) # 2.1 - if class exist in scanned class list: if self.check_classname_exist(left_class_name) == True: # 2.2 - if method name belong to original class => mod #logging.info(extract_method_name(line)) #logging.info(line.strip()) if self.is_method_in_class(extract_method_name(line), \ left_class_name): line = mod_method_call_name(line).rstrip() + "\n" # 3 - change referred field name() if ':' in line: left_class_name = ret_class_from_method_call( line, first=True) #print "debug:"+left_class_name if self.check_classname_exist(left_class_name) == True: line = mod_field_reference(line) # 4 - change classname() line = self.mod_line_class(line) # MEMO for test sys.stdout.write(line) continue elif is_method_pattern(line): ".method pattern change, execpt for blacklist_functions" #logging.info(line.strip()) line = mod_method_define_name(line) + "\n" line = self.mod_line_class(line) #logging.info(line) # MEMO for test sys.stdout.write(line) continue # .field definition pattern? elif '.field ' in line: "find field definition section" # TODO : have to handle corner case with double LL # e.g., .field public static final IAB_LEADERBOARD:Lcom/google/ads/AdSize; # L1234:Lcom/a/a/c; => what is between L~; ? line = mod_field_define_name(line) if ':' in line: first = line.split(':')[0] second = line.split(':')[1:] second = ''.join(second) line = first + ":" + self.mod_line_class(second) # MEMO for test sys.stdout.write(line) continue elif 'const-string ' in line: reg, string = ret_string_reg(line) if is_fieldname_in_blacklist(string): sys.stdout.write(line) continue if string in self.field_list: if '/' in line: line = line.replace("\"" + string + "\"", "\"" + rot13(string) + "\"") else: line = line.replace( "\"" + string + "\"", "\"" + replace_upper_L(rot13(string)) + "\"") sys.stdout.write(line) continue #elif is_class_reference(line): """ Just using class instance => should handle ex) const-class v1, Lcn/smstelphoneapp/service/STAService this handles: .class definition and const-class... """ #line = self.mod_line_class(line) #print "debug:class_reference" #if 'obrfcwr/giddcfh/j7/odd/OddQcndohOqhwjwhm' in line: # logging.info('HERE') line = self.mod_line_class(line) sys.stdout.write(line) fileinput.close()
# X, labels = X[0:100], labels[0:100]
# Y = tsne(X, 2, 50, 20.0);
# Plot.scatter(Y[:,0], Y[:,1], 20, labels);
# plt.scatter(Y[:,0], Y[:,1], 20, labels)
# plt.show()
vectormap = {}
for line in fileinput.input("../../data/word2vec/vectors.weibo.txt"):
    try:
        word, vector = line.strip().split("\t")[0], [float(i) for i in line.strip().split("\t")[1].split(" ")]
        vectormap[word] = vector
    except:
        continue
fileinput.close()
W, X, y = [], [], []
for line in fileinput.input("data/pos_eva.sort.txt"):
    word = line.strip()
    if vectormap.has_key(word) and len(word.decode("utf-8")) >= 2:
        W.append(word)
        X.append(vectormap[word])
        y.append("1")
    if len(W) == 200:
        break
fileinput.close()
for line in fileinput.input("data/pos_emo.sort.txt"):
    word = line.strip()
    if vectormap.has_key(word) and len(word.decode("utf-8")) >= 2:
        W.append(word)
        X.append(vectormap[word])
def parsefile(filename_ppd='../data/static_info_ppd.csv', filename_zmq='../data/static_info_zmq.txt', write_overlap=False): def parse(f_tags, f_score, f_averageProfit, f_registMoney, f_autobid, f_stockTransfer, f_fundsToken, f_guaranteeMode, f_guaranteeOrg, f_lauchTime, f_category, f_lng, f_lat): def extr_tags(x): types = ['国资系', '上市公司系', '银行系', '民营系'] # 公司类别 others = ['投之家合作平台', '股权上市', '接受过风投', '争议', '加入第三方征信', '加入协会'] r = [0] * 7 tags = x.split(',') for i in xrange(len(tags)): if tags[i] in types: r[0] = types.index(tags[i]) + 1 if tags[i] in others: r[others.index(tags[i]) + 1] = 1 return r def extr_ones(x, cut): x = x.strip(cut) return 0 if x == '' else float(x) def extr_autobid(x): return 0 if x == '' else 1 if x == '支持' else -1 def extr_stockTransfer(x): return -1 if x == '' else 0 if x == '随时' else 12 if x == '1年' else 300 if x == '不可转让' else x.strip( '个月') def extr_fundsToken(x): return 0 if x == '' or x == '无托管' else 1 def extr_ifGuarantee(x): return 0 if x == '' else 1 def extr_lauchTime(x): import datetime def date_difference(d1, d2): if '-' in d1 and '-' in d2: d1 = datetime.datetime.strptime(d1 + ' 00:00:00', '%Y-%m-%d %H:%M:%S') else: d1 = datetime.datetime.strptime(d1 + ' 00:00:00', '%Y年%m月%d日 %H:%M:%S') d2 = datetime.datetime.strptime(d2 + ' 00:00:00', '%Y-%m-%d %H:%M:%S') return (d2 - d1).days / 30 return 6 if x == '' else date_difference(x, '2016-05-01') def extr_category(x): _dict = {'股份合作企业':0, '私营企业':1, '港、澳、台投资企业':2, '股份制企业':3, \ '集体所有制企业':4, '外商投资企业':5, '国有企业':6, '联营企业':7} return _dict.get(x, -1) return extr_tags(f_tags)+\ [extr_ones(f_score,''),\ extr_ones(f_averageProfit,'%'),\ extr_ones(f_registMoney,' 万元'),\ extr_autobid(f_autobid),\ extr_stockTransfer(f_stockTransfer),\ extr_fundsToken(f_fundsToken),\ extr_ifGuarantee(f_guaranteeMode),\ extr_ifGuarantee(f_guaranteeOrg),\ extr_lauchTime(f_lauchTime),\ extr_category(f_category),\ extr_ones(f_lng,''),\ extr_ones(f_lat,'')] ppd_platforms = {} zmq_platforms = {} # 生成ppd静态特征 with open(filename_ppd, 'rb') as csvfile_ppd: reader = csv.DictReader(csvfile_ppd) for row in reader: parsed = parse(row['tags'],row['score'],row['averageProfit'],row['registMoney'],row['autobid'],row['stockTransfer'],row['fundsToken'],\ row['guaranteeMode'],row['guaranteeOrg'],row['lauchTime'],row['category'],row['lng'],row['lat']) ppd_platforms[row['platName']] = parsed # 生成zmq静态特征 for line in fileinput.input(filename_zmq): if fileinput.lineno() == 1: field_names = line.strip().split('\t') else: row = { name: field for name, field in zip(field_names, line.strip().split('\t')) } parsed = parse('','',row.get('平均收益',''),row.get('注册资本',''),row.get('自动投标',''),row.get('债权转让',''),row.get('资金托管',''),\ row.get('保障模式',''),'',row.get('上线时间',''),row.get('公司类型',''),'','') zmq_platforms[row['平台名称']] = parsed if fileinput.lineno() == ZMQ_Nline: break fileinput.close() if write_overlap: print '重合平台:', len( set(ppd_platforms.keys()) & set(zmq_platforms.keys())) with open('overlap_platforms.txt', 'w') as outfile: outfile.write('\n'.join( list(set(ppd_platforms.keys()) & set(zmq_platforms.keys())))) return ppd_platforms, zmq_platforms
def statistic(filename, bycol='INFOSOURCENAME', outdir='statistic_infosource', time_granularity='%m', Nmonth=20): # TBD: use collections.defaultdict() case_dict = {} for line in fileinput.input(filename): if fileinput.lineno() % 10**4 == 0: print sys.stdout.write(str(fileinput.lineno()) + '\r') sys.stdout.flush() TASKID, COORDX, COORDY, INFOSOURCENAME, DISCOVERTIME, SOLVINGTIME, \ ADDRESS, STREETNAME, DESCRIPTION, EXECUTEDEPTNAME, URGENTDEGREE, USEREVALUATE, \ INFOBCNAME, INFOSCNAME, INFOZCNAME, CASEENDBCNAME, CASEENDSCNAME = map(lambda x:x.strip(), line.decode('utf-8').split(u'\t')) INFOBCNAME, INFOSCNAME, INFOZCNAME = map( lambda x: re.sub(ur'\(浦东\)', '', x), [INFOBCNAME, INFOSCNAME, INFOZCNAME]) if COORDX and COORDY and float(COORDX) and float( COORDY) and DISCOVERTIME: timeslot = time.strptime( DISCOVERTIME, '%Y/%m/%d %H:%M:%S').tm_mon + ( 12 if DISCOVERTIME.startswith('2016') else 0) if bycol == 'STREETNAME': COL = STREETNAME elif bycol == 'EXECUTEDEPTNAME': COL = EXECUTEDEPTNAME elif bycol == 'INFOSOURCENAME': COL = INFOSOURCENAME else: raise Exception('Column is not supported.') case_dict[COL] = case_dict.get(COL, {}) case_dict[COL][INFOBCNAME] = case_dict[COL].get( INFOBCNAME, [0] * 24) case_dict[COL][INFOBCNAME][timeslot - 1] += 1 fileinput.close() compare = lambda array, delta: [ u'{0:+.2f}'.format(1. * (array[i] - array[i - delta]) / array[i - delta]) if i - delta >= 0 and array[i - delta] else u'-' for i in xrange(len(array)) ] for i, BCNAME in enumerate(category1 + category2): with open('{0}/{1}{2}.txt'.format(outdir, i, BCNAME.encode('utf-8')), 'w') as outfile: outfile.write(u'{0}\t{1}\n'.format( bycol.decode('utf-8'), u'\t'.join([ u'{0}年{1}月({2})'.format(year, month, title) for year in (2015, 2016) for month in xrange(1, 13) for title in (u'次数', u'对比上月', u'对比上年同期') ][:Nmonth * 3])).encode('utf-8')) for COL, cases in case_dict.iteritems(): if BCNAME in cases and BCNAME: outfile.write(u'{0}\t{1}\n'.format( COL, u'\t'.join([ u'{0}\t{1}\t{2}'.format(c1, c2, c3) for c1, c2, c3 in zip(cases[BCNAME], compare(cases[BCNAME], 1), compare(cases[BCNAME], 12)) ][:Nmonth])).encode('utf-8')) with open('{0}/_event.txt'.format(outdir, bycol), 'w') as outfile: outfile.write(bycol + '\t' + '\t'.join( BCNAME.encode('utf-8') for _, BCNAME in enumerate(category1 + category2)) + '\n') for COL, cases in case_dict.iteritems(): outfile.write( COL.encode('utf-8') + '\t' + '\t'.join([ str(sum(case_dict[COL].get(BCNAME, [0]))) for BCNAME in category1 + category2 ]) + '\n') with open('{0}/_sumup.txt'.format(outdir, bycol), 'w') as outfile: outfile.write(bycol + '\t' + '事件总数' + '\n') for COL, cases in case_dict.iteritems(): outfile.write( COL.encode('utf-8') + '\t' + str( sum([ sum(case_dict[COL].get(BCNAME, [0])) for BCNAME in category1 + category2 ])) + '\n')
def parse(filename, skipPhoto=True): """ A function to parse the cosima output simulation file. The function returns a simulation object. Example Usage: simulation = EventViewer.parse(filename) """ # Start an event counter currentEventNumber = 0 # Loop through each line of the file for line in fileinput.input([filename]): # Create the first event if 'SE' in line and currentEventNumber == 0: # Create a new simulation object to store all of the events in this run simulation = Simulation() # Create a new event event = Event() # Create a new object to store the interactions for this event interactions = Interactions() # Create a new object to store the hits for this event hits = Hits() # Increment the event number currentEventNumber = currentEventNumber + 1 # Store the existing event and create a new event elif 'SE' in line or 'EN' in line: # Store the interaction and hit objects in their respective event event.interactions = interactions event.hits = hits # Store the current event in the simulation object simulation.events.append(event) # Create a new event event = Event() # Create a new object to store the interactions for the new event interactions = Interactions() # Create a new object to store the hits for the new event hits = Hits() # Increment the event number currentEventNumber = currentEventNumber + 1 # Get the event ID if 'ID' in line and currentEventNumber != 0: event.id_trigger = line.split()[1] event.id_simulatedEvent = line.split()[2] # Get the event time if 'TI' in line and currentEventNumber != 0: event.time = line.split()[1] # Get the total deposited energy if 'ED' in line and currentEventNumber != 0: event.depositedEnergy = line.split()[1] # Get the total escaped energy if 'EC' in line and currentEventNumber != 0: event.escapedEnergy = line.split()[1] # Get the total deposited energy in non-sensative material if 'NS' in line and currentEventNumber != 0: event.depositedEnergy_NonSensitiveMaterial = line.split()[1] # if 'IA' in line and 'PHOT' not in line: if 'IA' in line: # Skip photoelectric interactions if skipPhoto == True: if 'PHOT' in line: continue # Split the line LineContents = line.split(';') # Parse each line and place the extracted information into their respective arrays interactions.interactionType.append(LineContents[0].split()[1].split()[0]) interactions.ID_interaction.append(LineContents[0].split()[2].split()[0]) interactions.ID_parentInteraction.append(LineContents[1].split()[0]) interactions.ID_detector.append(LineContents[2]) interactions.timeStart.append(float(LineContents[3])) interactions.x.append(float(LineContents[4])) interactions.y.append(float(LineContents[5])) interactions.z.append(float(LineContents[6])) interactions.ID_parentParticleType.append(LineContents[7].split()[0]) interactions.x_newDirection_OriginalParticle.append(float(LineContents[8])) interactions.y_newDirection_OriginalParticle.append(float(LineContents[9])) interactions.z_newDirection_OriginalParticle.append(float(LineContents[10])) interactions.x_polarization_OriginalParticle.append(LineContents[11]) interactions.y_polarization_OriginalParticle.append(LineContents[12]) interactions.z_polarization_OriginalParticle.append(LineContents[13]) interactions.newKineticEnergy_OriginalParticle.append(LineContents[14]) interactions.ID_childParticleType.append(LineContents[15]) interactions.x_direction_NewParticle.append(float(LineContents[16])) interactions.y_direction_NewParticle.append(float(LineContents[17])) interactions.z_direction_NewParticle.append(float(LineContents[18])) 
interactions.x_polarization_NewParticle.append(LineContents[19]) interactions.y_polarization_NewParticle.append(LineContents[20]) interactions.z_polarization_NewParticle.append(LineContents[21]) interactions.newKineticEnergy_NewParticle.append(LineContents[22].rstrip()) if 'INIT' in line: event.initialEnergy = interactions.newKineticEnergy_NewParticle[-1] # Create a unique particle id to track parent and child particles ID_parentParticle = interactions.ID_parentInteraction[-1] + '_' + interactions.ID_parentParticleType[-1] ID_childParticle = interactions.ID_interaction[-1] + '_' + interactions.ID_childParticleType[-1] # if ID_childParticleType == '1': # ID_childParticle = ID_parentInteraction + '_' + ID_childParticleType # else: # ID_childParticle = ID_interaction + '_' + ID_childParticleType # Store the information for the individual particles associated with this interaction # Record the particle trajectory if ID_parentParticle in event.particleInformation: event.particleInformation[ID_parentParticle]['x'].append(interactions.x[-1]) event.particleInformation[ID_parentParticle]['y'].append(interactions.y[-1]) event.particleInformation[ID_parentParticle]['z'].append(interactions.z[-1]) event.particleInformation[ID_parentParticle]['time'].append(interactions.timeStart[-1]) else: event.particleInformation[ID_parentParticle] = {} event.particleInformation[ID_parentParticle]['x'] = [interactions.x[-1]] event.particleInformation[ID_parentParticle]['y'] = [interactions.y[-1]] event.particleInformation[ID_parentParticle]['z'] = [interactions.z[-1]] event.particleInformation[ID_parentParticle]['time'] = [interactions.timeStart[-1]] if ID_childParticle in event.particleInformation: event.particleInformation[ID_childParticle]['x'].append(interactions.x[-1]) event.particleInformation[ID_childParticle]['y'].append(interactions.y[-1]) event.particleInformation[ID_childParticle]['z'].append(interactions.z[-1]) event.particleInformation[ID_childParticle]['time'].append(timeStart[-1]) else: event.particleInformation[ID_childParticle] = {} event.particleInformation[ID_childParticle]['x'] = [interactions.x[-1]] event.particleInformation[ID_childParticle]['y'] = [interactions.y[-1]] event.particleInformation[ID_childParticle]['z'] = [interactions.z[-1]] event.particleInformation[ID_childParticle]['time'] = [interactions.timeStart[-1]] # Record the hit information if 'HTsim' in line: # Split the line LineContents = line.split(';') # Extract the hit information hits.detector.append(int(LineContents[0].split(' ')[1])) hits.x.append(float(LineContents[1])) hits.y.append(float(LineContents[2])) hits.z.append(float(LineContents[3])) hits.energy.append(float(LineContents[4])) # Close the input file fileinput.close() return simulation
def test_state_is_None(self):
    """Tests that fileinput.close() does nothing if fileinput._state is None"""
    fileinput._state = None
    fileinput.close()
    self.assertIsNone(fileinput._state)
def replaceKey(fileName, key, value):
    for line in fileinput.FileInput(fileName, inplace=1):
        if key in line:
            line = line.replace(key, value)
        sys.stdout.write(line)
    fileinput.close()
def _sudoFichGrid_(nomFich, mode, pr=False): '''Lit le fichier de caractères et fait les vérifications. Si mode = 1 (défaut) : retourne une liste de lignes Si mode = 2 : retourne une liste de listes ''' if nomFich == None: raise Sudoku_Error \ ("Pas de nom de fichier - Abandon") if mode not in (1,2): raise Sudoku_Error ("Mode de lecture de fichier invalide") if pr: display("Lecture du fichier ", nomFich, " :") listLines = list() lineno = 0 try: for line in fileinput.input(nomFich): #si la ligne commence par # l'ignorer if line[0] == '#': continue #erreur s'il y a plus de 9 lignes valides dans le fichier if lineno > 9: raise Sudoku_Error \ ("Le fichier contient trop de données. Lecture "\ "interrompue. La grille a été remplie.") #vérifier les caractères et rectifier les vides line2 = "" for c in line: #ignorer les espaces if c == ' ': continue #traiter les équivalents à l'absence de chiffre if c in ('0', '.', '-', '_'): line2 = line2 + '0' #accepter uniquement de '1' à '9' elif str(0) <= str(c) <= str(9): line2 = line2 + str(c) #fin de la ligne ignorer le '\n' final elif c == '\n': break else: raise Sudoku_Error \ ("caractère invalides dans la ligne : " + line) #si la ligne est complètement blanche passer à la suivante #sans la compter if len(line2) == 0: continue #erreur si la ligne contient plus ou moins de 9 chiffres if len(line2) != 9: raise Sudoku_Error \ ("la ligne" + str(lineno) + " : " + line2 + \ " ne contient pas exactement 9 chiffres. ") #ok, la ligne est valide - Ajout à la liste if mode == 1: #mode liste de lignes listLines.append(line2) elif mode == 2: #mode liste de listes listeval = list() for c in line2: listeval.append(int(c)) listLines.append(listeval) else: raise Sudoku_Error ("Mode de lecture de fichier invalide") lineno = lineno + 1 if pr: display("ligne" + str(lineno) + " : " + line2) #end for except FileNotFoundError: raise Sudoku_Error ("Fichier invalide ou n'existe pas") return None finally: fileinput.close() #erreur s'il y a eu moins de 9 lignes valides dans le fichier if lineno < 9: raise Sudoku_Error \ ("Erreur de lecture, le fichier contient moins de 9 lignes "\ "de chiffres.") #ok, retourne la liste des 9 lignes de 9 chiffres if pr: display("Ok, 9 lignes de 9 chiffres.") return listLines
def main(args): p = argparse.ArgumentParser(description=__doc__) p.add_argument( "-c", "--bytes", default="", type=str, metavar='K', help="""output the last K bytes; or -c +K starting with the Kth""") p.add_argument("-f", "--follow", action="store_true", help="""follow specified files""") p.add_argument("-n", "--lines", default="10", type=str, metavar='K', help="""print the last K lines instead of 10; or use -n +K to print lines starting with the Kth""") p.add_argument("-q", "--quiet", "--silent", action='store_true', help="never print headers for each file") p.add_argument("-v", "--verbose", action='store_true', help="always print headers for each file") p.add_argument( "-s", "--sleep-interval", type=float, default=1.0, help= "with -f, sleep for approximately N seconds (default 1.0) between iterations." ) p.add_argument("files", action="store", nargs="*", help="files to print") ns = p.parse_args(args) status = 0 if len(ns.files) == 0: ns.files = ['-'] if ns.follow and '-' in ns.files: print('tail: warning: following stdin indefinitely is ineffective') if ns.bytes: use_bytes = True if ns.bytes[0] == '+': from_start = True else: from_start = False count = abs(int(ns.bytes)) # '-n -3' is equivalent to '-n 3' else: use_bytes = False if ns.lines[0] == '+': from_start = True else: from_start = False count = abs(int(ns.lines)) # '-n -3' is equivalent to '-n 3' try: for i, fname in enumerate(ns.files): if ns.verbose or (len(ns.files) > 1 and not ns.quiet): write_header(fname if fname != '-' else 'standard input') try: if fname == '-': f = sys.stdin else: f = open(fname) buf = [] j = -1 while True: j += 1 if use_bytes: l = f.read(1) else: l = f.readline() if not l: break buf.append(l) if from_start: if j >= count - 1: break elif len(buf) > count: del buf[0] for item in buf: print(item, end='') if i == len(ns.files) - 1 and ns.follow: for l in tail_f(f, ns.sleep_interval): print(l, end='') sys.stdout.flush() finally: if fname != '-': f.close() except Exception as e: print('tail :%s' % str(e)) status = 1 finally: fileinput.close() sys.exit(status)
def main(args, fout=sys.stdout):
    # print "\t".join(["seqnames", "start", "end", "name", "utr_length", "strand",
    #                  "lastexon_cds_start", "lastexon_cds_end", "name2",
    #                  "exonStarts", "exonEnds"])
    # conn = sqlite3.connect(args.db)
    # query = "select gene_biotype, transcript_biotype from ensembl_id where transcript_id = ?"
    conn = pd.read_table(args.db)
    conn = conn.loc[:, ['Transcript stable ID', 'Gene type', 'Transcript type']].drop_duplicates()
    conn = conn.set_index(['Transcript stable ID'])
    c = 0
    n = 0
    for row in fileinput.input(args.annotation_file[0], openhook=fileinput.hook_compressed):
        if fileinput.isfirstline() and not args.no_header:
            continue
        n = n + 1
        if re.match(r"^#", row):
            c = c + 1
            continue
        rowobj = Row(row, args.no_header)
        if not args.no_skip_random_chromosomes and \
                rowobj.is_on_random_chromosome():
            c = c + 1
            continue
        # filter for only protein-coding genes
        # result = conn.execute(query, (rowobj.get_stripped_name(),))
        # result = result.fetchone()
        # if result is None or \
        #    not (result[0] == "protein_coding" and \
        #         result[1] == "protein_coding"):
        #     c = c + 1
        #     continue

        # filter for only protein-coding genes
        try:
            result = conn.loc[rowobj.get_stripped_name()]
            if isinstance(result, pd.DataFrame):
                result = result.iloc[0, ]
            if not (result['Gene type'] == "protein_coding" and
                    result['Transcript type'] == "protein_coding"):
                c = c + 1
                continue
        except KeyError:
            c = c + 1
            continue
        bed = rowobj.extract_last_exon()
        if bed is not None:
            fout.write("\t".join([str(x) for x in bed]) + "\n")
        else:
            c = c + 1
    fileinput.close()
    # conn.close()
    if float(c) / float(n) > 0.75:
        print("Warning: %d/%d (%0.2f%%) were skipped. Are you using the "
              "correct database?" % (c, n, float(c) / float(n)),
              file=sys.stderr)
def __init__(self, lineno=-1, linecontent="", message=""):
    self.lineno = lineno
    self.linecontent = linecontent
    self.message = message
    fileinput.close()
def demonstrate_tide_effect(): day = [[[[0] * ranget for j in xrange(rangey)] for i in xrange(rangex)] for d in xrange(8)] for d, filename in enumerate(sorted( glob.glob(r"../data/pos_hour_user#/*"))): print filename for line in fileinput.input(filename): part = line.strip().split(" ") px, py, s, c = int(part[0].split(",")[0]), int( part[0].split(",")[1]), int(part[1]), int(part[2]) day[d][px][py][s] = c fileinput.close() for dlist, fname in [(range(1, 6), "weekday"), (range(0, 1) + range(6, 8), "weekend")]: mask = np.array([[1 if np.array([[day[df][i][j][kf] for kf in xrange(ranget)] for df in dlist]).sum()/len(dlist)>=10*ranget else 0 \ for j in xrange(rangey)] for i in xrange(rangex)]).sum() mesh = [[[sum([day[d][i][j][k] for d in dlist])/len(dlist) if np.array([[day[df][i][j][kf] for kf in xrange(ranget)] for df in dlist]).sum()/len(dlist)>=10*ranget else 0 for k in xrange(ranget)] \ for j in xrange(rangey)] for i in xrange(rangex)] mesh = [[[float(mesh[i][j][k])/sum(mesh[i][j]) if sum(mesh[i][j])!=0 else 0 for k in xrange(ranget)] \ for j in xrange(rangey)] for i in xrange(rangex)] avg = [ float( np.array([[mesh[i][j][k] for j in xrange(rangey)] for i in xrange(rangex)]).sum()) / mask for k in xrange(ranget) ] mesh = [[[mesh[i][j][k]-avg[k] if sum(mesh[i][j])!=0 else 0 for k in xrange(ranget)] \ for j in xrange(rangey)] for i in xrange(rangex)] with open("../data/var/{0}.txt".format(fname), "w") as f: for i in xrange(rangex): for j in xrange(rangey): if sum(mesh[i][j]) != 0: f.write("{0} {1} {2}\n".format( i, j, " ".join([str(round(x, 6)) for x in mesh[i][j]]))) plt.figure(figsize=(12, 8)) levels = arange(-1, 1.1, 0.1) cmap, norm = cm.PRGn, cm.colors.Normalize(vmax=1.1, vmin=-1) for c, t in enumerate([4, 8, 10, 16, 18, 22]): colormap = [[0 for j in xrange(rangey)] for i in xrange(rangex)] for line in fileinput.input("../data/var/weekday.txt"): part = line.strip().split(" ") x, y, f = int(part[0]), int(part[1]), float(part[2:][t]) colormap[x][y] = f fileinput.close() cmax = np.array([[abs(colormap[i][j]) for j in xrange(rangey)] for i in xrange(rangex)]).max() colormap = [[colormap[i][j] / cmax for j in xrange(rangey)] for i in xrange(rangex)] (X, Y), C = meshgrid(np.arange(100), np.arange(100)), np.array(colormap)[20:120, 20:120] subplot(2, 3, c + 1) cset = contourf(X, Y, C.T, levels, cmap=cm.get_cmap("seismic", len(levels)), norm=norm) plt.axis([0, 100 - 1, 0, 100 - 1]) plt.xticks(np.linspace(0, 100, 6)) plt.yticks(np.linspace(0, 100, 6)) plt.title('{0}:00'.format(str(t).zfill(2))) if c == 0: plt.xlabel('Longitude grid index /200m') plt.ylabel('Latitude grid index /200m') if c == 3: subplots_adjust(hspace=0.4) subplots_adjust(bottom=0.1, left=0.06, right=0.9, top=0.9) cax2 = axes([0.92, 0.10, 0.01, 0.8]) colorbar(cax=cax2) # show() for postfix in ('eps', 'png'): savefig('../figure/{0}/11.{0}'.format(postfix))
def _closeImageFile(self):
    self._file.close()
    del self._file  # We need to delete and free the file variable, otherwise Windows couldn't move the current file
    self._file = None
def main(): print "Processing Info.plist files..." MAJORSTR = "" MINORSTR = "" BUGFIXSTR = "" PLUG_VER_STR = "" BUNDLE_MFR = "" BUNDLE_NAME = "" PLUG_NAME_STR = "" PLUG_MFR_NAME_STR = "" PLUG_CHANNEL_IO = "" PLUG_COPYRIGHT = "" PLUG_UID = "" PLUG_MFR_UID = "" PLUG_FACTORY = "" PLUG_ENTRY = "" PLUG_VIEW_ENTRY = "" PLUG_IS_INST = 0 PLUG_DOES_MIDI = 0 # extract values from resource.h for line in fileinput.input(projectpath + "/resource.h", inplace=0): if "#define PLUG_VER " in line: PLUG_VER_STR = string.lstrip(line, "#define PLUG_VER ") PLUG_VER = int(PLUG_VER_STR, 16) MAJOR = PLUG_VER & 0xFFFF0000 MAJORSTR = str(MAJOR >> 16) MINOR = PLUG_VER & 0x0000FF00 MINORSTR = str(MINOR >> 8) BUGFIXSTR = str(PLUG_VER & 0x000000FF) if "#define PLUG_NAME " in line: PLUG_NAME_STR = string.lstrip(line, "#define PLUG_NAME ") if "#define PLUG_MFR " in line: PLUG_MFR_NAME_STR = string.lstrip(line, "#define PLUG_MFR ") if "#define BUNDLE_MFR " in line: BUNDLE_MFR = string.lstrip(line, "#define BUNDLE_MFR ") if "#define BUNDLE_NAME " in line: BUNDLE_NAME = string.lstrip(line, "#define BUNDLE_NAME ") if "#define PLUG_CHANNEL_IO " in line: PLUG_CHANNEL_IO = string.lstrip(line, "#define PLUG_CHANNEL_IO ") if "#define PLUG_COPYRIGHT " in line: PLUG_COPYRIGHT = string.lstrip(line, "#define PLUG_COPYRIGHT ") if "#define PLUG_UNIQUE_ID " in line: PLUG_UID = string.lstrip(line, "#define PLUG_UNIQUE_ID ") if "#define PLUG_MFR_ID " in line: PLUG_MFR_UID = string.lstrip(line, "#define PLUG_MFR_ID ") if "#define PLUG_ENTRY " in line: PLUG_ENTRY = string.lstrip(line, "#define PLUG_ENTRY ") if "#define PLUG_FACTORY " in line: PLUG_FACTORY = string.lstrip(line, "#define PLUG_FACTORY ") if "#define PLUG_VIEW_ENTRY " in line: PLUG_VIEW_ENTRY = string.lstrip(line, "#define PLUG_VIEW_ENTRY") if "#define PLUG_IS_INST " in line: PLUG_IS_INST = int(string.lstrip(line, "#define PLUG_IS_INST "), 16) if "#define PLUG_DOES_MIDI " in line: PLUG_DOES_MIDI = int( string.lstrip(line, "#define PLUG_DOES_MIDI "), 16) FULLVERSIONSTR = MAJORSTR + "." + MINORSTR + "." + BUGFIXSTR #strip quotes and newlines PLUG_VER_STR = PLUG_VER_STR[0:-1] BUNDLE_MFR = BUNDLE_MFR[1:-2] BUNDLE_NAME = BUNDLE_NAME[1:-2] PLUG_NAME_STR = PLUG_NAME_STR[1:-2] PLUG_MFR_NAME_STR = PLUG_MFR_NAME_STR[1:-2] PLUG_CHANNEL_IO = PLUG_CHANNEL_IO[1:-2] PLUG_COPYRIGHT = PLUG_COPYRIGHT[1:-2] PLUG_MFR_UID = PLUG_MFR_UID[1:-2] PLUG_UID = PLUG_UID[1:-2] PLUG_FACTORY = PLUG_FACTORY[0:-1] PLUG_ENTRY = PLUG_ENTRY[0:-1] PLUG_VIEW_ENTRY = PLUG_VIEW_ENTRY[0:-1] CFBundleGetInfoString = BUNDLE_NAME + " v" + FULLVERSIONSTR + " " + PLUG_COPYRIGHT CFBundleVersion = FULLVERSIONSTR CFBundlePackageType = "BNDL" CSResourcesFileMapped = True fileinput.close() LSMinimumSystemVersion = "10.7.0" BASE_SDK = "macosx10.13" DEPLOYMENT_TARGET = "10.7.0" # extract values from common.xcconfig for line in fileinput.input(projectpath + "/../../common.xcconfig", inplace=0): if not "//" in line: if "BASE_SDK = " in line: BASE_SDK = string.lstrip(line, "BASE_SDK = ") # if "MACOSX_DEPLOYMENT_TARGET = " in line: # DEPLOYMENT_TARGET = string.lstrip(line, "MACOSX_DEPLOYMENT_TARGET = ") BASE_SDK = BASE_SDK[0:-1] # DEPLOYMENT_TARGET = DEPLOYMENT_TARGET[0:-1] # DEPLOYMENT_TARGET += ".0" LSMinimumSystemVersion = DEPLOYMENT_TARGET # VST3 plistpath = projectpath + "/resources/" + BUNDLE_NAME + "-VST3-Info.plist" vst3 = plistlib.readPlist(plistpath) vst3['CFBundleExecutable'] = BUNDLE_NAME vst3['CFBundleGetInfoString'] = CFBundleGetInfoString vst3[ 'CFBundleIdentifier'] = "com." + BUNDLE_MFR + ".vst3." 
+ BUNDLE_NAME + "" vst3['CFBundleName'] = BUNDLE_NAME vst3['CFBundleVersion'] = CFBundleVersion vst3['CFBundleShortVersionString'] = CFBundleVersion vst3['LSMinimumSystemVersion'] = LSMinimumSystemVersion vst3['CFBundlePackageType'] = CFBundlePackageType vst3['CFBundleSignature'] = PLUG_UID vst3['CSResourcesFileMapped'] = CSResourcesFileMapped plistlib.writePlist(vst3, plistpath) replacestrs(plistpath, "//Apple//", "//Apple Computer//") # VST2 plistpath = projectpath + "/resources/" + BUNDLE_NAME + "-VST2-Info.plist" vst2 = plistlib.readPlist(plistpath) vst2['CFBundleExecutable'] = BUNDLE_NAME vst2['CFBundleGetInfoString'] = CFBundleGetInfoString vst2[ 'CFBundleIdentifier'] = "com." + BUNDLE_MFR + ".vst2." + BUNDLE_NAME + "" vst2['CFBundleName'] = BUNDLE_NAME vst2['CFBundleVersion'] = CFBundleVersion vst2['CFBundleShortVersionString'] = CFBundleVersion vst2['LSMinimumSystemVersion'] = LSMinimumSystemVersion vst2['CFBundlePackageType'] = CFBundlePackageType vst2['CFBundleSignature'] = PLUG_UID vst2['CSResourcesFileMapped'] = CSResourcesFileMapped plistlib.writePlist(vst2, plistpath) replacestrs(plistpath, "//Apple//", "//Apple Computer//") # AUDIOUNIT plistpath = projectpath + "/resources/" + BUNDLE_NAME + "-AU-Info.plist" au = plistlib.readPlist(plistpath) au['AudioComponents'] = [{}] au['AudioUnit Version'] = PLUG_VER_STR au['CFBundleExecutable'] = BUNDLE_NAME au['CFBundleGetInfoString'] = CFBundleGetInfoString au['CFBundleIdentifier'] = "com." + BUNDLE_MFR + ".audiounit." + BUNDLE_NAME + "" au['CFBundleName'] = BUNDLE_NAME au['CFBundleVersion'] = CFBundleVersion au['CFBundleShortVersionString'] = CFBundleVersion au['LSMinimumSystemVersion'] = LSMinimumSystemVersion au['CFBundlePackageType'] = CFBundlePackageType au['CFBundleSignature'] = PLUG_UID au['CSResourcesFileMapped'] = CSResourcesFileMapped #Steinberg AU Wrapper stuff #Apple 10.7+ SDK stuff #https://developer.apple.com/library/mac/technotes/tn2276/_index.html if PLUG_IS_INST: COMP_TYPE = kAudioUnitType_MusicDevice elif PLUG_DOES_MIDI: COMP_TYPE = kAudioUnitType_MusicEffect else: COMP_TYPE = kAudioUnitType_Effect #if compiling against 10.6 sdk, delete AudioComponents key if (BASE_SDK == "macosx10.5") or (BASE_SDK == "macosx10.6"): print "Component manager entry point only" if (au['AudioComponents']): del au['AudioComponents'] else: print "AudioComponent and Component manager entry points" au['AudioComponents'] = [{}] au['AudioComponents'][0]['resourceUsage'] = {} au['AudioComponents'][0]['description'] = PLUG_NAME_STR au['AudioComponents'][0]['factoryFunction'] = PLUG_FACTORY au['AudioComponents'][0]['manufacturer'] = PLUG_MFR_UID au['AudioComponents'][0][ 'name'] = PLUG_MFR_NAME_STR + ": " + PLUG_NAME_STR au['AudioComponents'][0]['subtype'] = PLUG_UID au['AudioComponents'][0]['type'] = COMP_TYPE au['AudioComponents'][0]['version'] = PLUG_VER #Sandbox stuff # https://developer.apple.com/library/Mac/technotes/tn2247/_index.html au['AudioComponents'][0]['sandboxSafe'] = True #au['AudioComponents'][0]['resourceUsage']['temporary-exception.files.all.read-write'] = True plistlib.writePlist(au, plistpath) replacestrs(plistpath, "//Apple//", "//Apple Computer//") # AAX plistpath = projectpath + "/resources/" + BUNDLE_NAME + "-AAX-Info.plist" aax = plistlib.readPlist(plistpath) aax['CFBundleExecutable'] = BUNDLE_NAME aax['CFBundleGetInfoString'] = CFBundleGetInfoString aax['CFBundleIdentifier'] = "com." + BUNDLE_MFR + ".aax." 
+ BUNDLE_NAME + "" aax['CFBundleName'] = BUNDLE_NAME aax['CFBundleVersion'] = CFBundleVersion aax['CFBundleShortVersionString'] = CFBundleVersion aax['LSMinimumSystemVersion'] = LSMinimumSystemVersion aax['CSResourcesFileMapped'] = CSResourcesFileMapped plistlib.writePlist(aax, plistpath) replacestrs(plistpath, "//Apple//", "//Apple Computer//") # RTAS plistpath = projectpath + "/resources/" + BUNDLE_NAME + "-RTAS-Info.plist" rtas = plistlib.readPlist(plistpath) rtas['CFBundleExecutable'] = BUNDLE_NAME rtas['CFBundleGetInfoString'] = CFBundleGetInfoString rtas[ 'CFBundleIdentifier'] = "com." + BUNDLE_MFR + ".rtas." + BUNDLE_NAME + "" rtas['CFBundleName'] = BUNDLE_NAME rtas['CFBundleVersion'] = CFBundleVersion rtas['CFBundleShortVersionString'] = CFBundleVersion rtas['LSMinimumSystemVersion'] = LSMinimumSystemVersion rtas['CSResourcesFileMapped'] = CSResourcesFileMapped plistlib.writePlist(rtas, plistpath) replacestrs(plistpath, "//Apple//", "//Apple Computer//") # APP plistpath = projectpath + "/resources/" + BUNDLE_NAME + "-OSXAPP-Info.plist" osxapp = plistlib.readPlist(plistpath) osxapp['CFBundleExecutable'] = BUNDLE_NAME osxapp['CFBundleGetInfoString'] = CFBundleGetInfoString osxapp[ 'CFBundleIdentifier'] = "com." + BUNDLE_MFR + ".standalone." + BUNDLE_NAME + "" osxapp['CFBundleName'] = BUNDLE_NAME osxapp['CFBundleVersion'] = CFBundleVersion osxapp['CFBundleShortVersionString'] = CFBundleVersion osxapp['LSMinimumSystemVersion'] = LSMinimumSystemVersion osxapp['CFBundlePackageType'] = CFBundlePackageType osxapp['CFBundleSignature'] = PLUG_UID osxapp['CSResourcesFileMapped'] = CSResourcesFileMapped osxapp['NSPrincipalClass'] = "SWELLApplication" osxapp['NSMainNibFile'] = "MainMenu" osxapp['LSApplicationCategoryType'] = "public.app-category.music" osxapp['CFBundleIconFile'] = BUNDLE_NAME + ".icns" plistlib.writePlist(osxapp, plistpath) replacestrs(plistpath, "//Apple//", "//Apple Computer//") print "Processing .exp symbol export file..."
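The plist handling above uses plistlib.readPlist and plistlib.writePlist, which are deprecated and were removed in Python 3.9. If the script is ever ported to a current Python, the same read-modify-write round trip can be done with plistlib.load and plistlib.dump; the sketch below is illustrative only (update_plist is a hypothetical helper name, not part of the script above).

import plistlib

def update_plist(plistpath, bundle_name, info_string):
    # Read the existing Info.plist into a plain dict.
    with open(plistpath, "rb") as f:
        info = plistlib.load(f)
    info["CFBundleExecutable"] = bundle_name
    info["CFBundleGetInfoString"] = info_string
    # Write it back as an XML plist (the default format).
    with open(plistpath, "wb") as f:
        plistlib.dump(info, f)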
def generate_genecords_tar(dbname, frame=None, outbox=None, GUI="n"): if GUI == "n": print("Generating genecords TAR archive") log("Generating genecords TAR archive") #Create genecords directory if nonexistent try: os.mkdir("genecords") except: pass #genecordslistdict = {} #Read gene coordinates input from genbank_mf_all/txt passedtags = [] genome_info = {} lasttag = "" for i in fileinput.input(dbname + "_all.txt"): if GUI == "y": frame.update() if len(i) > 0: i = i.replace(">","").replace("\n","") tabs = i.split("|") protein = tabs[3] genome = tabs[0] tag = genome[:5].upper() #If new genome is reached, save data in pickle file if tag != lasttag and not fileinput.isfirstline(): if GUI == "y": frame.update() #Load previous data if available if lasttag in passedtags: pickle_file = open("genecords" + os.sep + lasttag + ".pickle", "rb") previous_data = pickle.load(pickle_file) for key in previous_data: if key in genome_info: genome_info[key].extend(previous_data[key]) else: genome_info[key] = previous_data[key] else: passedtags.append(lasttag) save_to_pickle(lasttag, genome_info) genome_info = {} if genome in genome_info: genome_info[genome].append(i) else: genome_info[genome] = [i] lasttag = tag #Repeat data saving if lasttag in passedtags: pickle_file = open("genecords" + os.sep + lasttag + ".pickle", "rb") previous_data = pickle.load(pickle_file) for key in previous_data: if key in genome_info: genome_info[key].extend(previous_data[key]) else: genome_info[key] = previous_data[key] else: passedtags.append(lasttag) save_to_pickle(lasttag, genome_info) fileinput.close() #Sort dictionary by size #sortedgenecordskeylist = sortdictkeysbyvalues(genecordslistdict) #Archive directory as TAR file and remove original directory try: if GUI == "y": frame.update() tar = tarfile.open(dbname + ".cords.tar", "w") tar.add("genecords") tar.close() except: print("Could not create TAR file from genecords folder. Please create archive manually.") log("Could not create TAR file from genecords folder. Please create archive manually.", exit=True) if GUI == "y": frame.update() try: shutil.rmtree("genecords") except: pass
def main(args):
    global _stash
    ap = argparse.ArgumentParser()
    ap.add_argument('pattern', help='the pattern to match')
    ap.add_argument('files', nargs='*', help='files to be searched')
    ap.add_argument('-i', '--ignore-case', action='store_true', help='ignore case while searching')
    ap.add_argument('-v', '--invert', action='store_true', help='invert the search result')
    ap.add_argument('-c', '--count', action='store_true', help='count the search results instead of normal output')
    ns = ap.parse_args(args)

    flags = 0
    if ns.ignore_case:
        flags |= re.IGNORECASE
    pattern = re.compile(ns.pattern, flags=flags)

    # Do not try to grep directories
    files = [f for f in ns.files if not os.path.isdir(f)]

    fileinput.close()  # in case it is not closed
    try:
        counts = collections.defaultdict(int)
        for line in fileinput.input(files, openhook=fileinput.hook_encoded("utf-8")):
            if bool(pattern.search(line)) != ns.invert:
                if ns.count:
                    counts[fileinput.filename()] += 1
                else:
                    if ns.invert:
                        # optimize: if ns.invert, then no match, so no highlight color needed
                        newline = line
                    else:
                        newline = re.sub(pattern,
                                         lambda m: _stash.text_color(m.group(), 'red'),
                                         line)
                    if fileinput.isstdin():
                        fmt = u'{lineno}: {line}'
                    else:
                        fmt = u'{filename}: {lineno}: {line}'
                    print(fmt.format(filename=fileinput.filename(),
                                     lineno=fileinput.filelineno(),
                                     line=newline.rstrip()))
        if ns.count:
            for filename, count in counts.items():
                fmt = u'{count:6} {filename}'
                print(fmt.format(filename=filename, count=count))
    except Exception as err:
        print("grep: {}: {!s}".format(type(err).__name__, err), file=sys.stderr)
    finally:
        fileinput.close()
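Two fileinput details carry the behaviour above: hook_encoded decodes every file as UTF-8, and an empty files list makes fileinput read from standard input, which is what the isstdin() check switches the output format on. A minimal sketch of that pattern, with hypothetical file names:

import fileinput

def grep_like(paths):
    # With paths == [] this reads stdin, exactly like the grep above.
    for line in fileinput.input(paths, openhook=fileinput.hook_encoded("utf-8")):
        prefix = "-" if fileinput.isstdin() else fileinput.filename()
        print(prefix, fileinput.filelineno(), line.rstrip())
    fileinput.close()

grep_like(["a.txt", "b.txt"])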
def main(args, fout=sys.stdout):
    if args.debug:
        logger.setLevel(logging.DEBUG)
    conn = pd.read_table(args.db)
    conn = conn.loc[:, ['Transcript stable ID', 'Gene type', 'Transcript type']].drop_duplicates()
    conn = conn.set_index(['Transcript stable ID'])
    max_warnings = 10
    w = 0
    c = 0
    n = 0
    bad_chroms = set()
    for row in fileinput.input(args.annotation_file[0],
                               openhook=fileinput.hook_compressed):
        n = n + 1
        if fileinput.isfirstline() and (row.startswith("#bin") or
                                        row.startswith("bin")):
            logger.debug("Header detected in genePred file. Assuming UCSC"
                         " format.")
            continue
        else:
            logger.debug("No header detected. Assuming custom genePred.")
        if row.startswith("#"):
            continue
        rowobj = Row(row)
        if not args.no_skip_random_chromosomes and \
                rowobj.is_on_random_chromosome():
            c = c + 1
            continue
        if rowobj.chromosome_contains_underscore():
            w = w + 1
            if rowobj.chrom not in bad_chroms:
                logger.warning("Skipping chromosome %s because it contains"
                               " underscores" % rowobj.chrom)
                bad_chroms.add(rowobj.chrom)
            continue
        # filter for only protein-coding genes
        try:
            result = conn.loc[get_stripped_name(rowobj.name)]
            if isinstance(result, pd.DataFrame):
                result = result.iloc[0, ]
            if not (result['Gene type'] == "protein_coding" and
                    result['Transcript type'] == "protein_coding"):
                c = c + 1
                continue
        except KeyError:
            c = c + 1
            continue
        bed = rowobj.extract_last_exon()
        if bed is not None:
            fout.write("\t".join([str(x) for x in bed]) + "\n")
        else:
            c = c + 1
    fileinput.close()
    if float(c) / float(n) > 0.75:
        logger.warning("%d/%d (%0.2f%%) were skipped. Are you using the "
                       "correct database?" % (c, n, float(c) / float(n)))
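hook_compressed is what lets the same loop read plain, gzip, or bzip2 genePred files. One caveat worth noting: on Python 3 before 3.10, compressed files are yielded as bytes rather than str, so comparisons such as row.startswith("#bin") need a decode first. A minimal hedged sketch, with a hypothetical file name:

import fileinput

for row in fileinput.input("annotation.genePred.gz",
                           openhook=fileinput.hook_compressed):
    if isinstance(row, bytes):
        # gzip/bz2 members come back as bytes on Python 3 < 3.10
        row = row.decode("utf-8")
    if row.startswith("#"):
        continue
    # ... process the data row ...
fileinput.close()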
def demonstrate_clusters(): from sklearn.cluster import KMeans from scipy import interpolate from matplotlib.ticker import MultipleLocator, FormatStrFormatter plist, X = [], [] for line in fileinput.input("../data/var/weekday.txt"): part = line.strip().split(" ") x, y, f = int(part[0]), int(part[1]), [float(i) for i in part[2:]] plist.append([x, y]) X.append(f) fileinput.close() k_means = KMeans(init='k-means++', n_clusters=3, n_init=10) k_means.fit(X) k_means.labels_ = k_means.labels_ k_means.cluster_centers_ = k_means.cluster_centers_ mesh = [[0 for j in xrange(rangey)] for i in xrange(rangex)] for i in xrange(len(k_means.labels_)): if k_means.labels_[i] == 0: mesh[plist[i][0]][plist[i][1]] = 1.5 if k_means.labels_[i] == 1: mesh[plist[i][0]][plist[i][1]] = 0.6 if k_means.labels_[i] == 2: mesh[plist[i][0]][plist[i][1]] = -1 fig = plt.figure() ax = fig.add_subplot(111) (X, Y), C = meshgrid(np.arange(100), np.arange(100)), np.array(mesh)[20:120, 20:120] pcolormesh(X, Y, C.T, cmap='RdBu', vmin=-2, vmax=2) plt.axis([0, 100 - 1, 0, 100 - 1]) plt.xlabel('Longitude grid index /200m') plt.ylabel('Latitude grid index /200m') # plt.show() for postfix in ('eps', 'png'): savefig('../figure/{0}/13.{0}'.format(postfix)) fig = plt.figure() ax1 = fig.add_subplot(111) for _cluster, linestyle, label in [(0, 'k-', "Cluster 1"), (1, 'k--', "Cluster 2"), (2, 'k:', "Cluster 3")]: x, y = [i for i in xrange(ranget)], k_means.cluster_centers_[_cluster] tck = interpolate.splrep(x, y, s=0) xnew = np.arange(0, 23, 0.1) ynew = interpolate.splev(xnew, tck, der=0) plt.plot(xnew, ynew, linestyle, label=label, linewidth=2) plt.plot([0, 23], [0, 0], 'k--') plt.xlim(0, 23) plt.ylim(-0.03, 0.03) plt.xlabel('Time /hour') plt.ylabel('Differentiate index') handles, labels = ax1.get_legend_handles_labels() ax1.legend(handles, labels) xmajorLocator = MultipleLocator(1) xmajorFormatter = FormatStrFormatter('%d') ax1.xaxis.set_major_locator(xmajorLocator) ax1.xaxis.set_major_formatter(xmajorFormatter) # show() for postfix in ('eps', 'png'): savefig('../figure/{0}/14.{0}'.format(postfix))
def headquarters(): positive = 0 negative = 0 not_found = 0 f_not_found = open("not_found.txt", "w") f_negative = open("negative.txt", "w") f_positive = open("positive.txt", "w") tuples_not_found = set() for t in results: # first, try a direct match org_extracted = t[0].decode("utf8").upper().strip() locations_groundtruth = ground_truth.get(org_extracted) # if its a direct match with a ground truth organization, compare the locations if locations_groundtruth: loc_extracted = t[1].decode("utf8").upper().strip() found = False for locations in locations_groundtruth: # some locations in DBpedia contain diferente references, e.g., city,state # e.g.,: AUBURN HILLS, MICHIGAN # split and compare with both # in case it was found and got outside the for-loop below # no need to check more references if found == True: break locations_parts = locations.split(",") for loc in locations_parts: # match locations with Jaro-Winkler, keep those >=0.8 similarity score score = jellyfish.jaro_winkler( loc_extracted.encode("utf8"), loc.strip().encode("utf8")) if score >= 0.8: f_positive.write(t[0] + '\t' + t[1] + '\n') positive += 1 found = True break # if ground-truth (from DBpedia) is a country, and extracted is a city # check if the city is in that country elif loc in countries: if loc_extracted.encode("utf8") in country_cities[loc]: f_positive.write(t[0] + '\t' + t[1] + '\t' + '\n') positive += 1 found = True break #TODO # if ground-truth (from DBpedia) is a city, and extracted location is a country # check if that city is located in that country only # elif if found == False: negative += 1 f_negative.write( t[0] + '\t' + t[1] + '\t\t:' + ';'.join(locations_groundtruth).encode("utf8") + '\n') else: tuples_not_found.add(t) # try to expand the acronyms names_found = set() for name in tuples_not_found: # if it is a single token with all uppercase letters if len(name[0].split()) == 1 and name[0].isupper(): found = False # get all the possible expansions that match this acronym expansions = acronyms.get(name[0]) if expansions: # check if any of these expansions is an organization in the # ground_truth database and if it is, extract the locations for e in expansions: locations_groundtruth = ground_truth.get(e.upper()) if locations_groundtruth: for location in locations_groundtruth: locations_parts = location.split(",") for loc in locations_parts: # approximate similarity score = jellyfish.jaro_winkler( loc.encode("utf8"), name[1].upper()) if score >= 0.8: #f_positive.write(name[0]+' ('+e+')\t'+name[1]+'\t'+str(avg_score)+'\n') f_positive.write(name[0] + ' (' + e + ')\t' + name[1] + '\n') positive += 1 found = True names_found.add(name) break if (found == True): break for n in names_found: tuples_not_found.remove(n) # for tuples not found query Freebase # cache of strings that were already queried to Freebase queried = [] for line in fileinput.input( '/home/dsbatista/gigaword/ground-truth/freebase-queried.txt'): queried.append(line.strip()) fileinput.close() # file to save Freebase query results output = codecs.open( '/home/dsbatista/gigaword/ground-truth/freebase-output.txt', 'a', "utf-8") # open file for append, update 'freebase-queried.txt' with new issue queries f_queried = open( '/home/dsbatista/gigaword/ground-truth/freebase-queried.txt', "a") tuples_found = [] for t in tuples_not_found: org = t[0].strip() # for now do not query acronyms to Freebase with ~=, too many false positives if not (len(t[0].split()) == 1 and name[0].isupper()): # first check if that query string was already issued to Freebase # if 
not, query Freebase and save the result if org not in queried: if org == "Star-Times": continue response = queryFreebase(org) queried.append(org) if response != 'error': try: if response['result']: print "found:\t", org parseResponse(org, response, output) else: print "not found:\t", org f_queried.write(org + '\n') f_queried.flush() except TypeError, e: print org print e print response f_queried.close() output.close() sys.exit(0) except Exception, e: print org print e print response f_queried.close() output.close() sys.exit(0)
def build_ino(arduino_location, libraries_location, ino_file, com_num): # =========================== # create some background data # these need to reflect the details of your system # where is the Arduino program arduinoIdeVersion = { "1.5.6-r2": arduino_location, "1.6.3": arduino_location } # where are libraries stored arduinoExtraLibraries = libraries_location # where this program will store stuff # these directories will be beside this Python program compileDirName = "ArduinoTemp" archiveDirName = "ArduinoUploadArchive" # default build options build_options = { "action": "upload", "board": "arduino:avr:uno", "port": "COM" + str(com_num), "ide": "1.5.6-r2" } # some other important variables - just here for easy reference archiveRequired = False usedLibs = [] hFiles = [] # ============================ # ensure directories exist # and empty the compile directory # first the directory used for compiling pythonDir = os.path.dirname(os.path.realpath(__file__)) compileDir = os.path.join(pythonDir, compileDirName) if not os.path.exists(compileDir): os.makedirs(compileDir) existingFiles = os.listdir(compileDir) for f in existingFiles: os.remove(os.path.join(compileDir, f)) # then the directory where the Archives are saved archiveDir = os.path.join(pythonDir, archiveDirName) if not os.path.exists(archiveDir): os.makedirs(archiveDir) # ============================= # get the .ino file and figure out the build options # the stuff in the .ino file will have this format # and will start at the first line in the file # // python-build-start # // python-build-end inoFileName = ino_file inoBaseName, inoExt = os.path.splitext(os.path.basename(inoFileName)) ''' kept for when dynamic parsing is added numLines = 1 # in case there is no end-line maxLines = 6 buildError = "" if inoExt.strip() == ".ino": codeFile = open(inoFileName, 'r') startLine = codeFile.readline()[3:].strip() if startLine == "python-build-start": nextLine = codeFile.readline()[3:].strip() while nextLine != "python-build-end": buildCmd = nextLine.split(',') if len(buildCmd) > 1: buildOptions[buildCmd[0].strip()] = buildCmd[1].strip() numLines += 1 if numLines >= maxLines: buildError = "No end line" break nextLine = codeFile.readline()[3:].strip() else: buildError = "No start line" else: buildError = "Not a .ino file" if len(buildError) > 0: print("Sorry, can't process file - %s" % buildError) ''' # print buid Options print("BUILD OPTIONS") for n, m in build_options.items(): print("%s %s" % (n, m)) # ============================= # get the program filename for the selected IDE arduinoProg = arduinoIdeVersion[build_options["ide"]] # ============================= # prepare archive stuff # # create name of directory to save the code = name-yyyymmdd-hhmmss # this will go inside the directory archiveDir inoArchiveDirName = inoBaseName + time.strftime("-%Y%m%d-%H:%M:%S") # note this directory will only be created if there is a successful upload # the name is figured out here to be written into the .ino file so it can be printed by the Arduino code # it will appear as char archiveDirName[] = "nnnnn"; # if the .ino file does not have a line with char archiveDirName[] then it will be assumed # that no archiving is required # check for existence of line for line in fileinput.input(inoFileName): if "char archiveDirName[]" in line: archiveRequired = True break fileinput.close() if archiveRequired: for line in fileinput.input(inoFileName, inplace=1): if "char archiveDirName[]" in line: print('char archiveDirName[] = "%s";' % inoArchiveDirName) else: 
print(line.rstrip()) fileinput.close() # ~ os.utime(inoFileName, None) # ============================= # figure out what libraries and .h files are used # if there are .h files they will need to be copied to ArduinoTemp # first get the list of all the extra libraries that exist extraLibList = os.listdir(arduinoExtraLibraries) # go through the .ino file to get any lines with #include includeLines = [] for line in fileinput.input(inoFileName): if "#include" in line: includeLines.append(line.strip()) fileinput.close() print("#INCLUDE LINES") print(includeLines) # now look for lines with < signifying libraries for n in includeLines: angleLine = n.split('<') if len(angleLine) > 1: lib_name = angleLine[1].split('>') lib_name = lib_name[0].split('.') lib_name = lib_name[0].strip() # add the name to usedLibs if it is in the extraLibList if lib_name in extraLibList: usedLibs.append(lib_name) print("LIBS TO BE ARCHIVED") print(usedLibs) # then look for lines with " signifiying a reference to a .h file # NB the name will be a full path name for n in includeLines: quoteLine = n.split('"') if len(quoteLine) > 1: hName = quoteLine[1].split('"') hName = hName[0].strip() # add the name to hFiles hFiles.append(hName) print(".h FILES TO BE ARCHIVED") print(hFiles) # ============================== # copy the .ino file to the directory compileDir and change its name to match the directory saveFile = os.path.join(compileDir, compileDirName + ".ino") shutil.copy(inoFileName, saveFile) # =============================== # generate the Arduino command arduino_command = "%s --%s --board %s --port %s %s" % ( arduinoProg, build_options["action"], build_options["board"], build_options["port"], saveFile) print("ARDUINO COMMAND") print(arduino_command) # =============================== # call the IDE print("STARTING ARDUINO -- %s\n" % (build_options["action"])) presult = subprocess.call([ arduinoProg, "--%s" % build_options["action"], "--board", build_options["board"], "--port", build_options["port"], saveFile ], shell=True) if presult != 0: raise SystemError("Error, wrong COM number") else: print("\nARDUINO SUCCESSFUL") # ================================ # after a successful upload we may need to archive the code if archiveRequired: print("\nARCHIVING") # create the Archive directory ar_dir = os.path.join(archiveDir, inoArchiveDirName) print(ar_dir) # this ought to be a unique name - hence no need to check for duplicates os.makedirs(ar_dir) # copy the code into the new directory shutil.copy(inoFileName, ar_dir) # copy the .h files to the new directory for n in hFiles: shutil.copy(n, ar_dir) # copy the used libraries to the new directory for n in usedLibs: lib_name = os.path.join(arduinoExtraLibraries, n) dest_dir = os.path.join(ar_dir, "libraries", n) shutil.copytree(lib_name, dest_dir) print("\nARCHIVING DONE")
def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines, max_mismatch_no, aligner_command, db_path, tmp_path, outfile, XS_pct, XS_count, XSteve, adapter_mismatch, show_multiple_hit, show_unmapped_hit): logm("----------------------------------------------") logm("Read filename: %s" % main_read_file) logm("The first base (for mapping): %d" % cut1) logm("The last base (for mapping): %d" % cut2) logm("Path for short reads aligner: %s" % aligner_command + '\n') logm("Reference genome library path: %s" % db_path) if asktag == "Y": logm("Un-directional library") else: logm("Directional library") # end-of-if logm("Number of mismatches allowed: %s" % str(max_mismatch_no)) # adapter : strand-specific or not adapter = "" adapter_fw = "" adapter_rc = "" if adapter_file != "": try: adapter_inf = open(adapter_file, "r") if asktag == "N": #<--- directional library adapter = adapter_inf.readline() adapter_inf.close() adapter = adapter.rstrip("\n")[0:10] elif asktag == "Y": #<--- un-directional library adapter_fw = adapter_inf.readline() adapter_rc = adapter_inf.readline() adapter_inf.close() adapter_fw = adapter_fw.rstrip("\n")[0:10] adapter_rc = adapter_rc.rstrip("\n")[-10::] if adapter_rc == "": adapter_rc = reverse_compl_seq(adapter_fw) adapter_inf.close() except IOError: print "[Error] Cannot open adapter file : %s" % adapter_file exit(-1) if adapter_file != "": if asktag == "N": #<--- directional library logm("Adapter sequence: %s" % adapter) elif asktag == "Y": logm("3\' end adapter sequence: %s" % adapter_fw) logm("5\' end adapter sequence: %s" % adapter_rc) logm("-------------------------------- ") # helper method to join fname with tmp_path tmp_d = lambda fname: os.path.join(tmp_path, fname) db_d = lambda fname: os.path.join(db_path, fname) # splitting the big read file input_fname = os.path.split(main_read_file)[1] #---- Stats ------------------------------------------------------------ all_raw_reads = 0 all_trimmed = 0 all_mapped = 0 all_mapped_passed = 0 all_base_before_trim = 0 all_base_after_trim = 0 all_base_mapped = 0 numbers_premapped_lst = [0, 0, 0, 0] numbers_mapped_lst = [0, 0, 0, 0] mC_lst = [0, 0, 0] uC_lst = [0, 0, 0] no_my_files = 0 #---------------------------------------------------------------- if show_multiple_hit is not None: outf_MH = open(show_multiple_hit, 'w') if show_unmapped_hit is not None: outf_UH = open(show_unmapped_hit, 'w') for read_file in isplit_file(main_read_file, tmp_d(input_fname) + '-s-', no_small_lines): # for read_file in my_files: original_bs_reads = {} no_my_files += 1 random_id = ".tmp-" + str(random.randint(1000000, 9999999)) #------------------------------------------------------------------- # un-directional sequencing #------------------------------------------------------------------- if asktag == "Y": #---------------------------------------------------------------- outfile2 = tmp_d('Trimmed_C2T.fa' + random_id) outfile3 = tmp_d('Trimmed_G2A.fa' + random_id) outf2 = open(outfile2, 'w') outf3 = open(outfile3, 'w') #---------------------------------------------------------------- # detect format of input file try: if read_file.endswith( ".gz"): # support input file ending with ".gz" read_inf = gzip.open(read_file, "rb") else: read_inf = open(read_file, "r") except IOError: print "[Error] Cannot open input file : %s" % read_file exit(-1) logm("Start reading and trimming the input sequences") oneline = read_inf.readline() if oneline == "": oneline = "NNNN" l = oneline.split() input_format = "" if oneline[0] == "@": 
input_format = "fastq" elif len(l) == 1 and oneline[0] != ">": input_format = "seq" elif len(l) == 11: input_format = "qseq" elif oneline[0] == ">": input_format = "fasta" read_inf.close() #---------------------------------------------------------------- # read sequence, remove adapter and convert read_id = "" seq = "" seq_ready = "N" line_no = 0 fw_trimmed = 0 rc_trimmed = 0 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed ): # allow input with .gz if line == "": # fix bug for empty input line line = "NNNN" l = line.split() line_no += 1 if input_format == "seq": all_raw_reads += 1 read_id = str(all_raw_reads) read_id = read_id.zfill(12) seq = l[0] seq_ready = "Y" elif input_format == "fastq": l_fastq = math.fmod(line_no, 4) if l_fastq == 1: all_raw_reads += 1 read_id = l[0][1:] seq_ready = "N" elif l_fastq == 2: seq = l[0] seq_ready = "Y" else: seq = "" seq_ready = "N" elif input_format == "qseq": all_raw_reads += 1 read_id = str(all_raw_reads) read_id = read_id.zfill(12) seq = l[8] seq_ready = "Y" elif input_format == "fasta": l_fasta = math.fmod(line_no, 2) if l_fasta == 1: all_raw_reads += 1 read_id = l[0][1:] seq = "" seq_ready = "N" elif l_fasta == 0: seq = l[0] seq_ready = "Y" #---------------------------------------------------------------- if seq_ready == "Y": seq = seq[cut1 - 1:cut2] #<---- selecting 0..52 from 1..72 -e 52 seq = seq.upper() seq = seq.replace(".", "N") # striping BS adapter from 3' read all_base_before_trim += len(seq) if (adapter_fw != "") or (adapter_rc != ""): new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch) if len(new_read) < len(seq): fw_trimmed += 1 new_read_len = len(new_read) #print new_read new_read = Remove_5end_Adapter(new_read, adapter_rc, adapter_mismatch) new_read = RemoveAdapter(new_read, adapter_fw, adapter_mismatch) if len(new_read) < new_read_len: rc_trimmed += 1 #print new_read if len(new_read) < len(seq): all_trimmed += 1 seq = new_read all_base_after_trim += len(seq) if len(seq) <= 4: seq = ''.join(["N" for x in xrange(cut2 - cut1 + 1)]) #--------- trimmed_raw_BS_read ------------------ original_bs_reads[read_id] = seq #--------- FW_C2T ------------------ outf2.write('>%s\n%s\n' % (read_id, seq.replace("C", "T"))) #--------- RC_G2A ------------------ outf3.write('>%s\n%s\n' % (read_id, seq.replace("G", "A"))) fileinput.close() outf2.close() outf3.close() delete_files(read_file) logm("Reads trimmed from 3\' end : %d " % fw_trimmed) logm("Reads trimmed from 5\' end : %d " % rc_trimmed) #-------------------------------------------------------------------------------- # Bowtie mapping #------------------------------------------------------------------------------- logm("Start mapping") WC2T = tmp_d("W_C2T_m" + str(max_mismatch_no) + ".mapping" + random_id) CC2T = tmp_d("C_C2T_m" + str(max_mismatch_no) + ".mapping" + random_id) WG2A = tmp_d("W_G2A_m" + str(max_mismatch_no) + ".mapping" + random_id) CG2A = tmp_d("C_G2A_m" + str(max_mismatch_no) + ".mapping" + random_id) # print aligner_command % {'int_no_mismatches' : int_no_mismatches, # 'reference_genome' : os.path.join(db_path,'W_C2T'), # 'input_file' : outfile2, # 'output_file' : WC2T} run_in_parallel([ aligner_command % { 'reference_genome': os.path.join(db_path, 'W_C2T'), 'input_file': outfile2, 'output_file': WC2T }, aligner_command % { 'reference_genome': os.path.join(db_path, 'C_C2T'), 'input_file': outfile2, 'output_file': CC2T }, aligner_command % { 'reference_genome': os.path.join(db_path, 'W_G2A'), 'input_file': outfile3, 'output_file': WG2A }, 
aligner_command % { 'reference_genome': os.path.join(db_path, 'C_G2A'), 'input_file': outfile3, 'output_file': CG2A } ]) delete_files(outfile2, outfile3) #-------------------------------------------------------------------------------- # Post processing #-------------------------------------------------------------------------------- FW_C2T_U, FW_C2T_R = extract_mapping(WC2T) RC_G2A_U, RC_G2A_R = extract_mapping(CG2A) FW_G2A_U, FW_G2A_R = extract_mapping(WG2A) RC_C2T_U, RC_C2T_R = extract_mapping(CC2T) #---------------------------------------------------------------- # get unique-hit reads #---------------------------------------------------------------- Union_set = set(FW_C2T_U.iterkeys()) | set( RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set( RC_C2T_U.iterkeys()) Unique_FW_C2T = set() # + Unique_RC_G2A = set() # + Unique_FW_G2A = set() # - Unique_RC_C2T = set() # - Multiple_hits = set() for x in Union_set: _list = [] for d in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]: mis_lst = d.get(x, [99]) mis = int(mis_lst[0]) _list.append(mis) for d in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]: mis = d.get(x, 99) _list.append(mis) mini = min(_list) if _list.count(mini) == 1: mini_index = _list.index(mini) if mini_index == 0: Unique_FW_C2T.add(x) elif mini_index == 1: Unique_RC_G2A.add(x) elif mini_index == 2: Unique_FW_G2A.add(x) elif mini_index == 3: Unique_RC_C2T.add(x) # if mini_index = 4,5,6,7, indicating multiple hits else: Multiple_hits.add(x) else: Multiple_hits.add(x) # write reads rejected by Multiple Hits to file if show_multiple_hit is not None: #outf_MH=open(show_multiple_hit,'w') for i in Multiple_hits: outf_MH.write(">%s\n" % i) outf_MH.write("%s\n" % original_bs_reads[i]) #outf_MH.close() # write unmapped reads to file if show_unmapped_hit is not None: #outf_UH=open(show_unmapped_hit,'w') for i in original_bs_reads: if i not in Union_set: outf_UH.write(">%s\n" % i) outf_UH.write("%s\n" % original_bs_reads[i]) #outf_UH.close() del Union_set del FW_C2T_R del FW_G2A_R del RC_C2T_R del RC_G2A_R FW_C2T_uniq_lst = [[FW_C2T_U[u][1], u] for u in Unique_FW_C2T] FW_G2A_uniq_lst = [[FW_G2A_U[u][1], u] for u in Unique_FW_G2A] RC_C2T_uniq_lst = [[RC_C2T_U[u][1], u] for u in Unique_RC_C2T] RC_G2A_uniq_lst = [[RC_G2A_U[u][1], u] for u in Unique_RC_G2A] FW_C2T_uniq_lst.sort() RC_C2T_uniq_lst.sort() FW_G2A_uniq_lst.sort() RC_G2A_uniq_lst.sort() FW_C2T_uniq_lst = [x[1] for x in FW_C2T_uniq_lst] RC_C2T_uniq_lst = [x[1] for x in RC_C2T_uniq_lst] FW_G2A_uniq_lst = [x[1] for x in FW_G2A_uniq_lst] RC_G2A_uniq_lst = [x[1] for x in RC_G2A_uniq_lst] #---------------------------------------------------------------- numbers_premapped_lst[0] += len(Unique_FW_C2T) numbers_premapped_lst[1] += len(Unique_RC_G2A) numbers_premapped_lst[2] += len(Unique_FW_G2A) numbers_premapped_lst[3] += len(Unique_RC_C2T) del Unique_FW_C2T del Unique_FW_G2A del Unique_RC_C2T del Unique_RC_G2A #---------------------------------------------------------------- nn = 0 gseq = dict() chr_length = dict() for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst, FW_C2T_U), (RC_G2A_uniq_lst, RC_G2A_U), (FW_G2A_uniq_lst, FW_G2A_U), (RC_C2T_uniq_lst, RC_C2T_U)]: nn += 1 for header in ali_unique_lst: _, mapped_chr, mapped_location, cigar = ali_dic[header] original_BS = original_bs_reads[header] #------------------------------------- if mapped_chr not in gseq: gseq[mapped_chr] = deserialize(db_d(mapped_chr)) chr_length[mapped_chr] = len(gseq[mapped_chr]) if nn == 2 or nn == 3: cigar = list(reversed(cigar)) r_start, r_end, g_len = 
get_read_start_end_and_genome_length( cigar) all_mapped += 1 if nn == 1: # +FW mapped to + strand: FR = "+FW" mapped_strand = "+" elif nn == 2: # +RC mapped to + strand: FR = "+RC" # RC reads from -RC reflecting the methylation status on Watson strand (+) mapped_location = chr_length[ mapped_chr] - mapped_location - g_len mapped_strand = "+" original_BS = reverse_compl_seq( original_BS) # for RC reads elif nn == 3: # -RC mapped to - strand: mapped_strand = "-" FR = "-RC" # RC reads from +RC reflecting the methylation status on Crick strand (-) original_BS = reverse_compl_seq( original_BS) # for RC reads elif nn == 4: # -FW mapped to - strand: mapped_strand = "-" FR = "-FW" mapped_location = chr_length[ mapped_chr] - mapped_location - g_len origin_genome, next, output_genome = get_genomic_sequence( gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) if len(r_aln) == len(g_aln): N_mismatch = N_MIS(r_aln, g_aln) # if N_mismatch <= int(max_mismatch_no): mm_no = float(max_mismatch_no) if (mm_no >= 1 and N_mismatch <= mm_no) or ( mm_no < 1 and N_mismatch <= (mm_no * len(r_aln))): numbers_mapped_lst[nn - 1] += 1 all_mapped_passed += 1 methy = methy_seq(r_aln, g_aln + next) mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) #---XS FILTER---------------- XS = 0 if XSteve: if ('ZZZ' in methy.translate(None, "-XY")): XS = 1 # else: nCH = methy.count('y') + methy.count('z') nmCH = methy.count('Y') + methy.count('Z') if ((nmCH > XS_count) and nmCH / float(nCH + nmCH) > XS_pct): XS = 1 # # outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome=output_genome) all_base_mapped += len(original_BS) #---------------------------------------------------------------- logm("--> %s (%d) " % (read_file, no_my_files)) delete_files(WC2T, WG2A, CC2T, CG2A) #-------------------------------------------------------------------- # directional sequencing #-------------------------------------------------------------------- if asktag == "N": #---------------------------------------------------------------- outfile2 = tmp_d('Trimmed_C2T.fa' + random_id) outf2 = open(outfile2, 'w') #---------------------------------------------------------------- try: if read_file.endswith( ".gz"): # support input file ending with ".gz" read_inf = gzip.open(read_file, "rb") else: read_inf = open(read_file, "r") except IOError: print "[Error] Cannot open input file : %s" % read_file exit(-1) logm("Start reading and trimming the input sequences") oneline = read_inf.readline() if oneline == "": oneline = "NNNN" l = oneline.split() input_format = "" if oneline[0] == "@": input_format = "fastq" elif len(l) == 1 and oneline[0] != ">": input_format = "seq" elif len(l) == 11: input_format = "qseq" elif oneline[0] == ">": input_format = "fasta" read_inf.close() #print "detected data format: %s"%(input_format) #---------------------------------------------------------------- read_id = "" seq = "" seq_ready = "N" line_no = 0 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): if l == "": l = "NNNN" l = line.split() line_no += 1 if input_format == "seq": all_raw_reads += 1 read_id = str(all_raw_reads) read_id = read_id.zfill(12) seq = l[0] seq_ready = "Y" elif input_format == "fastq": l_fastq = math.fmod(line_no, 4) if l_fastq == 1: all_raw_reads += 1 read_id = l[0][1:] seq_ready = "N" elif l_fastq == 2: seq = l[0] seq_ready = "Y" else: seq = "" seq_ready = "N" elif 
input_format == "qseq": all_raw_reads += 1 read_id = str(all_raw_reads) read_id = read_id.zfill(12) seq = l[8] seq_ready = "Y" elif input_format == "fasta": l_fasta = math.fmod(line_no, 2) if l_fasta == 1: all_raw_reads += 1 read_id = l[0][1:] seq = "" seq_ready = "N" elif l_fasta == 0: seq = l[0] seq_ready = "Y" #-------------------------------- if seq_ready == "Y": seq = seq[cut1 - 1:cut2] #<---selecting 0..52 from 1..72 -e 52 seq = seq.upper() seq = seq.replace(".", "N") #--striping adapter from 3' read ------- all_base_before_trim += len(seq) if adapter != "": new_read = RemoveAdapter(seq, adapter, adapter_mismatch) if len(new_read) < len(seq): all_trimmed += 1 seq = new_read all_base_after_trim += len(seq) if len(seq) <= 4: seq = "N" * (cut2 - cut1 + 1) #--------- trimmed_raw_BS_read ------------------ original_bs_reads[read_id] = seq #--------- FW_C2T ------------------ outf2.write('>%s\n%s\n' % (read_id, seq.replace("C", "T"))) fileinput.close() outf2.close() delete_files(read_file) #-------------------------------------------------------------------------------- # Bowtie mapping #-------------------------------------------------------------------------------- logm("Start mapping") WC2T = tmp_d("W_C2T_m" + str(max_mismatch_no) + ".mapping" + random_id) CC2T = tmp_d("C_C2T_m" + str(max_mismatch_no) + ".mapping" + random_id) run_in_parallel([ aligner_command % { 'reference_genome': os.path.join(db_path, 'W_C2T'), 'input_file': outfile2, 'output_file': WC2T }, aligner_command % { 'reference_genome': os.path.join(db_path, 'C_C2T'), 'input_file': outfile2, 'output_file': CC2T } ]) delete_files(outfile2) #-------------------------------------------------------------------------------- # Post processing #-------------------------------------------------------------------------------- FW_C2T_U, FW_C2T_R = extract_mapping(WC2T) RC_C2T_U, RC_C2T_R = extract_mapping(CC2T) #---------------------------------------------------------------- # get uniq-hit reads #---------------------------------------------------------------- Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys()) Unique_FW_C2T = set() # + Unique_RC_C2T = set() # - Multiple_hits = set() # write reads rejected by Multiple Hits to file for x in Union_set: _list = [] for d in [FW_C2T_U, RC_C2T_U]: mis_lst = d.get(x, [99]) mis = int(mis_lst[0]) _list.append(mis) for d in [FW_C2T_R, RC_C2T_R]: mis = d.get(x, 99) _list.append(mis) mini = min(_list) #print _list if _list.count(mini) == 1: mini_index = _list.index(mini) if mini_index == 0: Unique_FW_C2T.add(x) elif mini_index == 1: Unique_RC_C2T.add(x) else: Multiple_hits.add(x) else: Multiple_hits.add(x) # write reads rejected by Multiple Hits to file if show_multiple_hit is not None: #outf_MH=open(show_multiple_hit,'w') for i in Multiple_hits: outf_MH.write(">%s\n" % i) outf_MH.write("%s\n" % original_bs_reads[i]) #outf_MH.close() # write unmapped reads to file if show_unmapped_hit is not None: #outf_UH=open(show_unmapped_hit,'w') for i in original_bs_reads: if i not in Union_set: outf_UH.write(">%s\n" % i) outf_UH.write("%s\n" % original_bs_reads[i]) #outf_UH.close() FW_C2T_uniq_lst = [[FW_C2T_U[u][1], u] for u in Unique_FW_C2T] RC_C2T_uniq_lst = [[RC_C2T_U[u][1], u] for u in Unique_RC_C2T] FW_C2T_uniq_lst.sort() RC_C2T_uniq_lst.sort() FW_C2T_uniq_lst = [x[1] for x in FW_C2T_uniq_lst] RC_C2T_uniq_lst = [x[1] for x in RC_C2T_uniq_lst] #---------------------------------------------------------------- numbers_premapped_lst[0] += len(Unique_FW_C2T) numbers_premapped_lst[1] += 
len(Unique_RC_C2T) #---------------------------------------------------------------- nn = 0 gseq = dict() chr_length = dict() for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst, FW_C2T_U), (RC_C2T_uniq_lst, RC_C2T_U)]: nn += 1 for header in ali_unique_lst: _, mapped_chr, mapped_location, cigar = ali_dic[header] original_BS = original_bs_reads[header] #------------------------------------- if mapped_chr not in gseq: gseq[mapped_chr] = deserialize(db_d(mapped_chr)) chr_length[mapped_chr] = len(gseq[mapped_chr]) r_start, r_end, g_len = get_read_start_end_and_genome_length( cigar) all_mapped += 1 if nn == 1: # +FW mapped to + strand: FR = "+FW" mapped_strand = "+" elif nn == 2: # -FW mapped to - strand: mapped_strand = "-" FR = "-FW" mapped_location = chr_length[ mapped_chr] - mapped_location - g_len origin_genome, next, output_genome = get_genomic_sequence( gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) if len(r_aln) == len(g_aln): N_mismatch = N_MIS( r_aln, g_aln ) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides mm_no = float(max_mismatch_no) if (mm_no >= 1 and N_mismatch <= mm_no) or ( mm_no < 1 and N_mismatch <= (mm_no * len(r_aln))): numbers_mapped_lst[nn - 1] += 1 all_mapped_passed += 1 methy = methy_seq(r_aln, g_aln + next) mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) #---XS FILTER---------------- XS = 0 if XSteve: if ('ZZZ' in methy.translate(None, "-XY")): XS = 1 # else: nCH = methy.count('y') + methy.count('z') nmCH = methy.count('Y') + methy.count('Z') if ((nmCH > XS_count) and nmCH / float(nCH + nmCH) > XS_pct): XS = 1 # # outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome=output_genome) all_base_mapped += len(original_BS) #---------------------------------------------------------------- logm("--> %s (%d) " % (read_file, no_my_files)) delete_files(WC2T, CC2T) #---------------------------------------------------------------- delete_files(tmp_path) if show_multiple_hit is not None: outf_MH.close() if show_unmapped_hit is not None: outf_UH.close() logm("----------------------------------------------") logm("Number of raw reads: %d" % all_raw_reads) if all_raw_reads > 0: logm("Number of bases in total: %d " % all_base_before_trim) if (asktag == "N" and adapter != "") or (asktag == "Y" and adapter_fw != ""): logm("Number of reads having adapter removed: %d" % all_trimmed) trim_percent = ( float(all_base_after_trim) / all_base_before_trim) if all_base_before_trim > 0 else 0 logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, trim_percent)) # logm("Number of reads are rejected because of multiple hits: %d" % len(Multiple_hits)) logm("Number of unique-hits reads (before post-filtering): %d" % all_mapped) if asktag == "Y": logm( " %7d FW reads mapped to Watson strand (before post-filtering)" % (numbers_premapped_lst[0])) logm( " %7d RC reads mapped to Watson strand (before post-filtering)" % (numbers_premapped_lst[1])) logm( " %7d FW reads mapped to Crick strand (before post-filtering)" % (numbers_premapped_lst[2])) logm( " %7d RC reads mapped to Crick strand (before post-filtering)" % (numbers_premapped_lst[3])) elif asktag == "N": logm( " %7d FW reads mapped to Watson strand (before post-filtering)" % (numbers_premapped_lst[0])) logm( " %7d FW reads mapped to Crick strand (before post-filtering)" % (numbers_premapped_lst[1])) 
logm("Post-filtering %d uniquely aligned reads with mismatches <= %s" % (all_mapped_passed, max_mismatch_no)) if asktag == "Y": logm(" %7d FW reads mapped to Watson strand" % (numbers_mapped_lst[0])) logm(" %7d RC reads mapped to Watson strand" % (numbers_mapped_lst[1])) logm(" %7d FW reads mapped to Crick strand" % (numbers_mapped_lst[2])) logm(" %7d RC reads mapped to Crick strand" % (numbers_mapped_lst[3])) elif asktag == "N": logm(" %7d FW reads mapped to Watson strand" % (numbers_mapped_lst[0])) logm(" %7d FW reads mapped to Crick strand" % (numbers_mapped_lst[1])) Mappability = (100 * float(all_mapped_passed) / all_raw_reads) if all_raw_reads > 0 else 0 logm("Mappability = %1.4f%%" % Mappability) logm("Total bases of uniquely mapped reads : %7d" % all_base_mapped) # n_CG = mC_lst[0] + uC_lst[0] n_CHG = mC_lst[1] + uC_lst[1] n_CHH = mC_lst[2] + uC_lst[2] # logm("----------------------------------------------") logm("Methylated C in mapped reads ") # logm(" mCG %1.3f%%" % ((100 * float(mC_lst[0]) / n_CG) if n_CG != 0 else 0)) logm(" mCHG %1.3f%%" % ((100 * float(mC_lst[1]) / n_CHG) if n_CHG != 0 else 0)) logm(" mCHH %1.3f%%" % ((100 * float(mC_lst[2]) / n_CHH) if n_CHH != 0 else 0)) # logm("----------------------------------------------") logm("File : %s" % main_read_file) elapsed("Resource / CPU time") logm("------------------- END --------------------") close_log()
def ignoreORGs(data):
    for line in fileinput.input(data):
        ORGS_to_ignore.append(line.strip())
    fileinput.close()
def rec_cnt(fqfn):
    """Return the number of lines in the file fqfn."""
    for _rec in fileinput.input(fqfn):
        pass
    count = fileinput.lineno()
    fileinput.close()
    return count
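rec_cnt relies on fileinput.lineno() being the cumulative count of lines read once the loop has exhausted the input; fileinput.filelineno() would instead report the count within the current file only. A small sketch of the difference, with hypothetical file names:

import fileinput

for _ in fileinput.input(["part1.txt", "part2.txt"]):
    pass
print(fileinput.lineno())      # total lines read across both files
print(fileinput.filelineno())  # lines read from part2.txt only
fileinput.close()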
def write_after_line(line_search, new_text, filepath):
    # Rewrite filepath in place, emitting new_text after every line that contains line_search.
    for line in fileinput.input(filepath, inplace=True):
        print(line.rstrip("\n"))
        if line_search in line.strip():
            print(new_text)
    fileinput.close()
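With inplace=True, fileinput redirects standard output into a backup-and-replace copy of the file, so every print() inside the loop becomes a line of the rewritten file and anything not printed is dropped. A hedged usage example with a hypothetical file and marker:

# Inserts "key = value" after every line containing "[section]" in config.ini.
write_after_line("[section]", "key = value", "config.ini")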
def _replace_in_file(filename, old, new):
    """ Replaces old with new in file filename. """
    for line in fileinput.FileInput(filename, inplace=1):
        line = line.replace(old, new)
        print(line, end='')
    fileinput.close()
def main(): parser = OptionParser(usage="usage: %prog [options] json_file1 json_file2 ...") parser.add_option("-t", "--top", action="store", type="int", dest="top", help="number of top pairs to display", default=10) parser.add_option("-o", action="store", type="string", dest="out_path", help="output path", default="hashtag-cooccurrences.csv") (options, args) = parser.parse_args() if( len(args) < 1 ): parser.error( "Must specify at least one JSONL file" ) log.basicConfig(level=20, format='%(message)s') # Count pairs of hashtags in the same tweet pair_counts = defaultdict(int) for tweets_path in args: log.info("Loading tweets from %s ..." % tweets_path) # Process every line as JSON data hashtags = {} num_tweets, num_failed, line_number = 0, 0, 0 num_multiple = 0 for l in fileinput.input(tweets_path): l = l.strip() if len(l) == 0: continue try: line_number += 1 tweet = json.loads(l) tweet_tags = set() # find the tags if "entities" in tweet: if "hashtags" in tweet["entities"] and len(tweet["entities"]["hashtags"]) > 0: for tag in tweet["entities"]["hashtags"]: tweet_tags.add( "#" + tag["text"].lower().strip() ) # do not count duplicates tweet_tags = list(tweet_tags) # process the pairs if len(tweet_tags) > 1: num_multiple += 1 for p in itertools.combinations(tweet_tags, 2): if p[0] < p[1]: pair = frozenset( [p[0],p[1]] ) else: pair = frozenset( [p[1],p[0]] ) pair_counts[pair] += 1 num_tweets += 1 if line_number % 50000 == 0: log.info("Processed %d lines" % line_number) except Exception as e: log.error("Failed to parse tweet on line %d: %s" % ( line_number, e ) ) num_failed += 1 fileinput.close() log.info("Processed %d tweets from file" % num_tweets ) log.info("%d/%d tweets in file contained more than one hashtag" % ( num_multiple, num_tweets ) ) log.info("Total of %d unique pairs of hashtags" % len(pair_counts) ) # Output pairs log.info("Writing pairs to %s ..." % options.out_path ) fout = codecs.open( options.out_path, "w", encoding="utf-8", errors="ignore" ) fout.write("Hashtag1\tHastag2\tCount\n") for p in pair_counts: pair = list(p) pair.sort() fout.write( "%s\t%s\t%d\n" % ( pair[0], pair[1], pair_counts[p] ) ) fout.close() # Display top counts sx = sorted(pair_counts.items(), key=operator.itemgetter(1), reverse=True) log.info("Top %d co-occurring hashtag pairs:" % min( len(sx), options.top ) ) tab = PrettyTable( ["Hashtag1", "Hashtag2", "Count"] ) tab.align["Hashtag1"] = "l" tab.align["Hashtag2"] = "l" tab.align["Count"] = "r" for i, p in enumerate(sx): if i > options.top: break pair = list(p[0]) pair.sort() tab.add_row( [pair[0], pair[1], p[1]] ) log.info(tab)
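One small simplification worth noting in the pair loop above: frozenset ignores element order, so the p[0] < p[1] branch builds exactly the same key in both cases. A minimal sketch with hypothetical tag values:

import itertools
from collections import defaultdict

pair_counts = defaultdict(int)
tweet_tags = ["#python", "#data", "#nlp"]   # hypothetical tags from one tweet
for p in itertools.combinations(tweet_tags, 2):
    # frozenset(p) is order-insensitive, so no manual swap is needed
    pair_counts[frozenset(p)] += 1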
def func2(): with open("../data/feature/trace_all_statistic_filter","w") as f: for line in fileinput.input("../data/feature/trace_all_statistic"): part = line.strip().split(" ") mac, sex, tot, feat = part[0], {"男性":0, "女性":1}[part[1]], part[2], part[3:] array_0, array_p, array_1, array_2, array_3, array_4, array_5, array_6, array_7, array_8, array_9, array_10 = {}, {}, [], [], [], [], [], [], [], [], [], [] for one in feat: objs = one.split("@") chars, ints = objs[0].split("+"), objs[1].split(",") if len(chars) == 1: if chars[0] in ["WD","WE","A","B","C","D","0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23"]: array_0[chars[0]] = ints[1]+","+str(int(float(ints[2])*1000))+","+str(int(float(ints[3])*1000))+","+ints[4]+","+ints[5]+","+ints[6]+","+ints[7]+","+str(int(float(ints[8])*1000)) else: array_p[chars[0]] = str(int(float(ints[2])*1000))+","+str(int(float(ints[3])*1000))+","+ints[4]+","+ints[5]+","+ints[6]+","+ints[7]+","+str(int(float(ints[8])*1000)) if len(chars) == 2 and chars[0] in ["WD","WE"] and chars[1] in ["A","B","C","D"]: array_0["+".join(chars)] = str(int(float(ints[2])*1000))+","+str(int(float(ints[3])*1000))+","+ints[4]+","+ints[5]+","+ints[6]+","+ints[7]+","+str(int(float(ints[8])*1000)) if len(chars) == 2 and chars[0] in ["WD","WE"] and chars[1] in ["0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23"]: array_0["+".join(chars)] = str(int(float(ints[2])*1000))+","+str(int(float(ints[3])*1000))+","+ints[4]+","+ints[5]+","+ints[6]+","+ints[7]+","+str(int(float(ints[8])*1000)) if len(chars) == 2 and chars[1] in ["WD","WE"]: if chars[0] in ["Acad","Adm","Ath","Cant","Hosp","Lib","Soc","Supp","Teach","Other"]: array_1.append({'k':"+".join(chars),'cnt':int(float(ints[2])*1000),'sum':int(float(ints[3])*1000),'min':int(ints[4]),'max':int(ints[5]),'avg':int(ints[6]),'med':int(ints[7]),'std':int(float(ints[8])*1000)}) else: array_2.append({'k':"+".join(chars),'cnt':int(float(ints[2])*1000),'sum':int(float(ints[3])*1000),'min':int(ints[4]),'max':int(ints[5]),'avg':int(ints[6]),'med':int(ints[7]),'std':int(float(ints[8])*1000)}) if len(chars) == 2 and chars[1] in ["A","B","C","D"]: if chars[0] in ["Acad","Adm","Ath","Cant","Hosp","Lib","Soc","Supp","Teach","Other"]: array_3.append({'k':"+".join(chars),'cnt':int(float(ints[2])*1000),'sum':int(float(ints[3])*1000),'min':int(ints[4]),'max':int(ints[5]),'avg':int(ints[6]),'med':int(ints[7]),'std':int(float(ints[8])*1000)}) else: array_4.append({'k':"+".join(chars),'cnt':int(float(ints[2])*1000),'sum':int(float(ints[3])*1000),'min':int(ints[4]),'max':int(ints[5]),'avg':int(ints[6]),'med':int(ints[7]),'std':int(float(ints[8])*1000)}) if len(chars) == 2 and chars[1] in ["0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23"]: if chars[0] in ["Acad","Adm","Ath","Cant","Hosp","Lib","Soc","Supp","Teach","Other"]: array_5.append({'k':"+".join(chars),'cnt':int(float(ints[2])*1000),'sum':int(float(ints[3])*1000),'min':int(ints[4]),'max':int(ints[5]),'avg':int(ints[6]),'med':int(ints[7]),'std':int(float(ints[8])*1000)}) else: array_6.append({'k':"+".join(chars),'cnt':int(float(ints[2])*1000),'sum':int(float(ints[3])*1000),'min':int(ints[4]),'max':int(ints[5]),'avg':int(ints[6]),'med':int(ints[7]),'std':int(float(ints[8])*1000)}) if len(chars) == 3 and chars[1] in ["WD","WE"] and chars[2] in ["A","B","C","D"]: if chars[0] in 
["Acad","Adm","Ath","Cant","Hosp","Lib","Soc","Supp","Teach","Other"]: array_7.append({'k':"+".join(chars),'cnt':int(float(ints[2])*1000),'sum':int(float(ints[3])*1000),'min':int(ints[4]),'max':int(ints[5]),'avg':int(ints[6]),'med':int(ints[7]),'std':int(float(ints[8])*1000)}) else: array_8.append({'k':"+".join(chars),'cnt':int(float(ints[2])*1000),'sum':int(float(ints[3])*1000),'min':int(ints[4]),'max':int(ints[5]),'avg':int(ints[6]),'med':int(ints[7]),'std':int(float(ints[8])*1000)}) if len(chars) == 3 and chars[1] in ["WD","WE"] and chars[2] in ["0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23"]: if chars[0] in ["Acad","Adm","Ath","Cant","Hosp","Lib","Soc","Supp","Teach","Other"]: array_9.append({'k':"+".join(chars),'cnt':int(float(ints[2])*1000),'sum':int(float(ints[3])*1000),'min':int(ints[4]),'max':int(ints[5]),'avg':int(ints[6]),'med':int(ints[7]),'std':int(float(ints[8])*1000)}) else: array_10.append({'k':"+".join(chars),'cnt':int(float(ints[2])*1000),'sum':int(float(ints[3])*1000),'min':int(ints[4]),'max':int(ints[5]),'avg':int(ints[6]),'med':int(ints[7]),'std':int(float(ints[8])*1000)}) array_1_cnt, array_1_avg = sorted(array_1, key=lambda k: k['cnt'], reverse = True)[0:10], sorted(array_1, key=lambda k: k['avg'], reverse = True)[0:10] for tmp in array_1_cnt: if not tmp in array_1_avg: array_1_avg.append(tmp) array_2_cnt, array_2_avg = sorted(array_2, key=lambda k: k['cnt'], reverse = True)[0:30], sorted(array_2, key=lambda k: k['avg'], reverse = True)[0:30] for tmp in array_2_cnt: if not tmp in array_2_avg: array_2_avg.append(tmp) array_3_cnt, array_3_avg = sorted(array_3, key=lambda k: k['cnt'], reverse = True)[0:10], sorted(array_3, key=lambda k: k['avg'], reverse = True)[0:10] for tmp in array_3_cnt: if not tmp in array_3_avg: array_3_avg.append(tmp) array_4_cnt, array_4_avg = sorted(array_4, key=lambda k: k['cnt'], reverse = True)[0:30], sorted(array_4, key=lambda k: k['avg'], reverse = True)[0:30] for tmp in array_4_cnt: if not tmp in array_4_avg: array_4_avg.append(tmp) array_5_cnt, array_5_avg = sorted(array_5, key=lambda k: k['cnt'], reverse = True)[0:10], sorted(array_5, key=lambda k: k['avg'], reverse = True)[0:10] for tmp in array_5_cnt: if not tmp in array_5_avg: array_5_avg.append(tmp) array_6_cnt, array_6_avg = sorted(array_6, key=lambda k: k['cnt'], reverse = True)[0:30], sorted(array_6, key=lambda k: k['avg'], reverse = True)[0:30] for tmp in array_6_cnt: if not tmp in array_6_avg: array_6_avg.append(tmp) array_7_cnt, array_7_avg = sorted(array_7, key=lambda k: k['cnt'], reverse = True)[0:10], sorted(array_7, key=lambda k: k['avg'], reverse = True)[0:10] for tmp in array_7_cnt: if not tmp in array_7_avg: array_7_avg.append(tmp) array_8_cnt, array_8_avg = sorted(array_8, key=lambda k: k['cnt'], reverse = True)[0:30], sorted(array_8, key=lambda k: k['avg'], reverse = True)[0:30] for tmp in array_8_cnt: if not tmp in array_8_avg: array_8_avg.append(tmp) array_9_cnt, array_9_avg = sorted(array_9, key=lambda k: k['cnt'], reverse = True)[0:10], sorted(array_9, key=lambda k: k['avg'], reverse = True)[0:10] for tmp in array_9_cnt: if not tmp in array_9_avg: array_9_avg.append(tmp) array_10_cnt, array_10_avg = sorted(array_10, key=lambda k: k['cnt'], reverse = True)[0:30], sorted(array_10, key=lambda k: k['avg'], reverse = True)[0:30] for tmp in array_10_cnt: if not tmp in array_10_avg: array_10_avg.append(tmp) array = [] for key in 
["WD","WE","A","B","C","D","0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23"]: if array_0.has_key(key): array.append(key+"@"+array_0[key]) else: array.append(key+"@0,0,0,0,0,0,0,0") for key1 in ["WD","WE"]: for key2 in ["A","B","C","D"]: key = key1+"+"+key2 if array_0.has_key(key): array.append(key+"@"+array_0[key]) else: array.append(key+"@0,0,0,0,0,0,0") for key1 in ["WD","WE"]: for key2 in ["0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23"]: key = key1+"+"+key2 if array_0.has_key(key): array.append(key+"@"+array_0[key]) else: array.append(key+"@0,0,0,0,0,0,0") for k,v in array_p.iteritems(): array.append(k+'@'+v) for one in array_1_avg: array.append(one['k']+"@"+str(one['cnt'])+","+str(one['sum'])+","+str(one['min'])+","+str(one['max'])+","+str(one['avg'])+","+str(one['med'])+","+str(one['std'])) for one in array_2_avg: array.append(one['k']+"@"+str(one['cnt'])+","+str(one['sum'])+","+str(one['min'])+","+str(one['max'])+","+str(one['avg'])+","+str(one['med'])+","+str(one['std'])) for one in array_3_avg: array.append(one['k']+"@"+str(one['cnt'])+","+str(one['sum'])+","+str(one['min'])+","+str(one['max'])+","+str(one['avg'])+","+str(one['med'])+","+str(one['std'])) for one in array_4_avg: array.append(one['k']+"@"+str(one['cnt'])+","+str(one['sum'])+","+str(one['min'])+","+str(one['max'])+","+str(one['avg'])+","+str(one['med'])+","+str(one['std'])) for one in array_5_avg: array.append(one['k']+"@"+str(one['cnt'])+","+str(one['sum'])+","+str(one['min'])+","+str(one['max'])+","+str(one['avg'])+","+str(one['med'])+","+str(one['std'])) for one in array_6_avg: array.append(one['k']+"@"+str(one['cnt'])+","+str(one['sum'])+","+str(one['min'])+","+str(one['max'])+","+str(one['avg'])+","+str(one['med'])+","+str(one['std'])) for one in array_7_avg: array.append(one['k']+"@"+str(one['cnt'])+","+str(one['sum'])+","+str(one['min'])+","+str(one['max'])+","+str(one['avg'])+","+str(one['med'])+","+str(one['std'])) for one in array_8_avg: array.append(one['k']+"@"+str(one['cnt'])+","+str(one['sum'])+","+str(one['min'])+","+str(one['max'])+","+str(one['avg'])+","+str(one['med'])+","+str(one['std'])) for one in array_9_avg: array.append(one['k']+"@"+str(one['cnt'])+","+str(one['sum'])+","+str(one['min'])+","+str(one['max'])+","+str(one['avg'])+","+str(one['med'])+","+str(one['std'])) for one in array_10_avg: array.append(one['k']+"@"+str(one['cnt'])+","+str(one['sum'])+","+str(one['min'])+","+str(one['max'])+","+str(one['avg'])+","+str(one['med'])+","+str(one['std'])) f.write(mac+' '+str(sex)+' '+tot+' '+" ".join(array)+'\n') fileinput.close()
def load_file(fn: str) -> List[str]:
    out_recs = []
    for rec in fileinput.input(fn):
        out_recs.append(rec)
    fileinput.close()
    return out_recs
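The main thing fileinput adds over a plain open() here is that load_file("-") reads from standard input. For a single regular file the function is equivalent to the following sketch (load_file_plain is a hypothetical name):

from typing import List

def load_file_plain(fn: str) -> List[str]:
    with open(fn) as f:
        return f.readlines()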
def func1(): class Stats: def __init__(self, sequence): self.sequence = [int(item) for item in sequence] def sum(self): if len(self.sequence) < 1: return None else: return sum(self.sequence) def cnt(self): return len(self.sequence) def min(self): if len(self.sequence) < 1: return None else: return min(self.sequence) def max(self): if len(self.sequence) < 1: return None else: return max(self.sequence) def avg(self): if len(self.sequence) < 1: return None else: return sum(self.sequence) / len(self.sequence) def med(self): if len(self.sequence) < 1: return None else: self.sequence.sort() return self.sequence[len(self.sequence) // 2] def std(self): if len(self.sequence) < 1: return None else: avg = self.avg() sdsq = sum([(i - avg) ** 2 for i in self.sequence]) stdev = (sdsq / (len(self.sequence) - 1)) ** .5 return stdev mapping, glob = {}, {} for line in gzip.open("../data/feature/trace_all.gz"): part = line.strip().split(" ") mac, dy, wd, ss, hr, cl, bd, dr = part[0].replace(":",""), part[1], part[2], part[3], part[4], part[5], part[6], int(part[8]) wd = {'1':'WD','2':'WE'}[wd] ss = {'1':'A','2':'B','3':'C','4':'D'}[ss] if not mapping.has_key(mac): mapping[mac] = {} if not glob.has_key(mac): glob[mac] = {} if not glob[mac].has_key(dy): glob[mac][dy] = 0 glob[mac][dy] = glob[mac][dy]+dr array = [wd, ss, hr, cl, bd, wd+'+'+ss, wd+'+'+hr,\ cl+'+'+wd, cl+'+'+ss, cl+'+'+hr, cl+'+'+wd+'+'+ss, cl+'+'+wd+'+'+hr,\ bd+'+'+wd, bd+'+'+ss, bd+'+'+hr, bd+'+'+wd+'+'+ss, bd+'+'+wd+'+'+hr] for one in array: if not mapping[mac].has_key(one): mapping[mac][one] = {} if not mapping[mac][one].has_key(dy): mapping[mac][one][dy] = [] if dr>0: mapping[mac][one][dy].append(dr) jac = {} for line in fileinput.input("../data/jaccount/jaccount_taged"): part = line.strip().split(" ") dev, mac, sex = part[0], part[1], part[2] if dev == "mobile": jac[mac] = {'sex':sex} fileinput.close() with open('../data/feature/trace_all_statistic', 'w') as f: for k,v in mapping.iteritems(): if jac.has_key(k): array = [] for x,y in glob[k].iteritems(): array.append(y) if len(array) >= 2: stats = Stats(array) _cnt, _sum, _min, _max, _avg, _med, _std = stats.cnt(), stats.sum(), stats.min(), stats.max(), stats.avg(), stats.med(), stats.std() f.write(k+' '+jac[k]['sex']+' tot@'+str(_cnt)+','+str(_sum)+','+str(_min)+','+str(_max)+','+str(_avg)+','+str(_med)+','+str(int(_std))) for p,q in v.iteritems(): array = [] for x,y in q.iteritems(): array.append(sum(y)) if len(array) >= 2: stats = Stats(array) f.write(' '+p+'@'+str(stats.cnt())+','+str(stats.sum())+',%.4f,'%(float(stats.cnt())/_cnt)+'%.4f,'%(float(stats.sum())/_sum)+str(stats.min())+','+str(stats.max())+','+str(stats.avg())+','+str(stats.med())+','+str(int(stats.std()))) f.write('\n')
def edit_ossec_conf():
    rule_elements = []
    # Ignore elements (legacy elements and default elements)
    ignore_elements = [
        '<!--',
        '<decoder>etc/decoder.xml',
        '<decoder>etc/local_decoder.xml',
        '<decoder_dir>etc/decoders',
        '<decoder_dir>etc/ossec_decoders',
        '<decoder_dir>etc/wazuh_decoders',
        '<include>local_rules.xml'
    ]

    # Template
    template_file = open("{0}/rules.template".format(source_rules_path), 'r')
    include_template = template_file.readlines()
    include_template = include_template[3:-2]  # Remove 3 first lines and 2 last lines
    template_file.close()

    # Remove "<rules>*</rules>" and "...ossec_config> <!-- rules global entry -->"
    inside_rules = False
    for line in fileinput.input(ossec_conf, inplace=True):
        if '<rules>' in line.strip():
            inside_rules = True
            continue  # Remove line
        elif '</rules>' in line.strip():
            inside_rules = False
            continue  # Remove line
        elif 'rules global entry' in line.strip():
            continue  # Remove line
        else:
            if inside_rules:
                if any(ignore_element in line.strip() for ignore_element in ignore_elements):
                    continue
                else:
                    rule_elements.append(line)  # Save rule element
            else:
                print(line.rstrip("\n"))  # Keep line
    fileinput.close()

    # Custom items in <rules>
    custom_decoder = []
    custom_include = []
    custom_list = []
    custom_rule_dir = []
    for rule_element in rule_elements:
        if '<decoder' in rule_element:
            custom_decoder.append(rule_element)
        elif '<list>' in rule_element:
            custom_list.append(rule_element)
        elif '<rule_dir>' in rule_element:
            custom_rule_dir.append(rule_element)
        elif '<include>' in rule_element:
            m = search('<include>(.+_rules.xml)', rule_element)
            if m:
                rule = m.group(1)
                if any(rule in include_r for include_r in include_template):
                    continue
                else:
                    custom_include.append(rule_element)

    # Write file
    with open(ossec_conf, "a") as conf_file:
        conf_file.write("<ossec_config> <!-- rules global entry -->\n")
        conf_file.write(" <rules>\n")
        conf_file.write(" <decoder_dir>etc/decoders</decoder_dir>\n")
        for c_d in custom_decoder:
            conf_file.write(c_d)
        conf_file.write(" <decoder>etc/local_decoder.xml</decoder>\n")
        for c_l in custom_list:
            conf_file.write(c_l)
        for i_t in include_template:
            conf_file.write(i_t)
        for c_r in custom_rule_dir:
            conf_file.write(c_r)
        for c_i in custom_include:
            conf_file.write(c_i)
        conf_file.write(" <include>local_rules.xml</include>\n")
        conf_file.write(" </rules>\n")
        conf_file.write("</ossec_config> <!-- rules global entry -->\n")

    os.chown(ossec_conf, root_uid, ossec_gid)