class Pusher:
    def __init__(self, config):
        self.c1 = Collector(config.DB)
        self.c2 = Collector(config.DB2)
        self.s1 = SheetItf(config.GOOGLE['credentials'], config.GOOGLE['scope'],
                           config.GOOGLE['key'], config.GOOGLE['sheet'])

    def push(self):
        # Timestamp the row, then append the readings from both collectors.
        self.s1.addToRow([str(time.strftime("%d/%m/%Y")), str(time.strftime("%H:%M:%S"))])
        self.s1.addToRow(self.c1.collect())
        self.s1.addToRow(self.c2.collect())
        print("pushing:", self.s1.nextRow)
        self.s1.pushRow()
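A hypothetical usage sketch for Pusher (the attributes of Config mirror exactly what __init__ reads; every value here is a placeholder):

class Config(object):
    DB = 'readings.db'
    DB2 = 'readings2.db'
    GOOGLE = {
        'credentials': 'credentials.json',
        'scope': 'https://www.googleapis.com/auth/spreadsheets',
        'key': '<spreadsheet-key>',
        'sheet': 'Sheet1',
    }

pusher = Pusher(Config())
pusher.push()  # appends one timestamped row of collector readings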
def collect_stock():
    # Retry the collection up to three times; the original nested bare
    # try/except blocks simply re-ran collect_daily_data on any failure.
    collector = Collector()
    for attempt in range(3):
        try:
            collector.collect_daily_data()
            break
        except Exception:
            if attempt == 2:
                raise
def runTest(self):
    collector = Collector(self.tmpCacheDir,
                          serverHost='127.0.0.1',
                          serverPort='8000',
                          serverProtocol='http',
                          serverUser=testAuthCreds[0],
                          serverPass=testAuthCreds[1],
                          clientId='test-fuzzer1')
    config = ProgramConfiguration("mozilla-central", "x86-64", "linux", version="ba0bc4f26681")
    crashInfo = CrashInfo.fromRawCrashData([], asanTraceCrash.splitlines(), config)

    # TODO: This is only a rudimentary check to see if we submitted *something*.
    # We should check more precisely that the information submitted is correct.
    issueCount = self.getRemoteCrashEntryCount()
    collector.submit(crashInfo, exampleTestCase)
    self.assertEqual(self.getRemoteCrashEntryCount(), issueCount + 1)
def collect_data(mode):
    video = videoCapture()
    collector = Collector(mode)
    directory = collector.getDirectory()
    frame = video.getMainFrame()
    ROI_NAME = 'Region of Interest'
    print(color("Data collecting started...", Colors.yellow))
    print(color("video window is opening", Colors.yellow))
    collector.generateFolders()
    while video.isRunning():
        video.update()
        # Add texts and frames
        video.addRightText(mode)
        for gesture in gestures:
            video.addRightText(gestures[gesture] + ': ' + str(filesCounter(directory + str(gesture))[0]))
        video.addFrame(ROI_NAME,
                       ROIcoordinates(frame)[0], ROIcoordinates(frame)[1],
                       ROIcoordinates(frame)[2], ROIcoordinates(frame)[3])
        video.display()
        interrupt = cv2.waitKey(10)
        ROI = video.getFrameRegion(ROI_NAME)
        collector.keyPressToImage(ROI, interrupt)
        if interrupt & 0xFF == 27:  # Esc stops the capture loop
            video.stop()
    video.releaseCamera()
    cv2.destroyAllWindows()
    print(color("Data collecting finished", Colors.yellow))
def work(self, login, password, proxy=None, first=False):
    if not first:
        # If we aren't first, sleep a bit and let the first one create the database
        time.sleep(1)
    print("Spawning thread {}, proxy {}".format(login, proxy))
    # Import here to cope with the psycopg restriction on using the same
    # session in different processes
    from Collector import Collector
    collector = Collector(login, password, proxy)
    worker = VkWorker(collector)
def runTest(self):
    collector = Collector(self.tmpCacheDir,
                          serverHost='127.0.0.1',
                          serverPort='8000',
                          serverProtocol='http',
                          serverUser=testAuthCreds[0],
                          serverPass=testAuthCreds[1],
                          clientId='test-fuzzer1')
    collector.refresh()

    receivedSignatures = False
    for sigFile in os.listdir(self.tmpCacheDir):
        receivedSignatures = True
        CrashSignature.fromFile(os.path.join(self.tmpCacheDir, sigFile))

    if not receivedSignatures:
        self.skipTest("Server did not provide signatures")
def __init__(self, startTime=0, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.sharedObjects = {}
    self.startTime = startTime
    self.xcontext = XValueContext(lambda: self.now() + self.startTime)
    self.t = self.xcontext.t
    self.collector = Collector()
    self.tcounter = 0
    self.initialize()
    self.xvalues = {}
    self.logging = True
def run(self):
    self._collector_ready = False
    self._res_list = []
    collector_ref = Collector.start(self._num_executors, self)
    for idx in range(self._num_executors):
        self._executors[idx].tell({'type': 'run', 'collector': collector_ref})
    # Poll until the collector has gathered all executor results
    while not self._collector_ready:
        time.sleep(0.1)
    collector_ref.stop()
    return self._res_list
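The start()/tell()/stop() calls above match the ActorRef API of the pykka actor library; a hypothetical sketch of the Collector actor they imply (the message shape and master attribute names are assumptions taken from run() above):

import pykka

class Collector(pykka.ThreadingActor):
    def __init__(self, num_executors, master):
        super().__init__()
        self._remaining = num_executors
        self._master = master

    def on_receive(self, message):
        # One result message is expected from each executor.
        self._master._res_list.append(message.get('result'))
        self._remaining -= 1
        if self._remaining == 0:
            self._master._collector_ready = True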
def collect_data(outfilename, measurement_time=default_time):
    collector = Collector(port, isTCP=isTCP)
    time.sleep(1)

    outfile_encoder = open(outfilename + '_encoder.dat', 'w')
    outfile_encoder.write('#TIME ERROR DIRECTION TIMERCOUNT REFERENCE\n')
    outfile_encoder.flush()
    outfile_irig = open(outfilename + '_irig.dat', 'w')
    outfile_irig.write('#TIMERCOUNT YEAR DAY HOUR MINUTE SECOND\n')
    outfile_irig.flush()
    outfile_timeout = open(outfilename + '_timeout.dat', 'w')
    outfile_timeout.write('#TIME TYPE\n')
    outfile_timeout.flush()
    outfile_error = open(outfilename + '_error.dat', 'w')
    outfile_error.write('#TIME ERRORCODE\n')
    outfile_error.flush()

    encoder_header = 0x1EAF
    irig_header = 0xCAFE
    timeout_header = 0x1234
    error_header = 0xE12A

    encoder_extractor = EncoderExtractor(encoder_header)
    encoder_bytesize = encoder_extractor.pi.total_bytesize
    irig_extractor = IrigExtractor(irig_header)
    irig_bytesize = irig_extractor.pi.total_bytesize
    timeout_extractor = TimeoutExtractor(timeout_header)
    timeout_bytesize = timeout_extractor.pi.total_bytesize
    error_extractor = ErrorExtractor(error_header)
    error_bytesize = error_extractor.pi.total_bytesize

    header_unpack_str = encoder_extractor.pi.header_str
    header_size = encoder_extractor.pi.header_num
    header_bytesize = encoder_extractor.pi.header_bytesize

    start_time = time.time()
    while True:
        encoder_frames = []
        irig_frames = []
        timeout_frames = []
        error_frames = []

        # Empty the queue and parse its contents appropriately
        approx_size = collector.queue.qsize()
        if approx_size > 0 and verbose > 0:
            print('approximate size = {}'.format(approx_size))
        for i in range(approx_size):
            # block=True : block execution until there is something in the queue
            # timeout=None: get() will wait indefinitely
            data = collector.queue.get(block=True, timeout=None)
            # Once data is extracted from the queue, parse its contents
            # and loop until the data is exhausted
            data_len = len(data)
            if verbose > 0:
                print('obtained data size = {}'.format(data_len))
            parse_index = 0
            while parse_index < data_len:
                if verbose > 0:
                    print('parse_index = {} / data_len = {}'.format(parse_index, data_len))
                # Extract the header
                header = data[parse_index:parse_index + header_bytesize]
                if verbose > 0 and header != 0:
                    print('obtained header (size) = {} ({})'.format(header, len(header)))
                elif verbose > 1:
                    print('obtained header (size) = {} ({})'.format(header, len(header)))
                # Unpack from binary (byte order: little endian '<', format: 'L' = unsigned long)
                header = struct.unpack("%s%s" % (endian, header_unpack_str), header)[0]
                # Route the packet to the matching extractor
                if header == encoder_header:
                    if verbose > 0:
                        print(' header == encoder')
                    encoder_frames.append(encoder_extractor.extract(data, parse_index))
                    parse_index += encoder_bytesize
                elif header == irig_header:
                    if verbose > 0:
                        print(' header == irig')
                    irig_frames.append(irig_extractor.extract(data, parse_index))
                    parse_index += irig_bytesize
                elif header == timeout_header:
                    if verbose > 0:
                        print(' header == timeout')
                    timeout_frames.append(timeout_extractor.extract(data, parse_index))
                    parse_index += timeout_bytesize
                elif header == error_header:
                    if verbose > 0:
                        print(' header == error')
                    error_frames.append(error_extractor.extract(data, parse_index))
                    parse_index += error_bytesize
                elif header == 0:
                    if verbose > 1:
                        print(' header == 0')
                    parse_index += header_bytesize
                else:
                    try:
                        raise RuntimeError(
                            "Bad header! This is not an encoder/irig/timeout/error header!: %s" % str(header))
                    except RuntimeError as e:
                        print(e)
                        if verbose > 0:
                            print('###get data###')
                            print(data)
                            print('##############')
                    break
            # Reset the data string
            data = ''

        currenttime = int(time.time())
        # Write encoder data
        for frame in encoder_frames:
            ncount = len(frame['timercount'])
            for i in range(ncount):
                outfile_encoder.write('{} {} {} {} {}\n'.format(
                    currenttime, 1 - frame['error'][i], frame['quad'][i],
                    frame['timercount'][i], frame['position'][i]))
        # Write IRIG data
        for frame in irig_frames:
            outfile_irig.write('{} {} {} {} {} {}\n'.format(
                frame['timercount'], frame['year'], frame['day'],
                frame['hour'], frame['minute'], frame['second']))
        # Write timeout data
        for frame in timeout_frames:
            outfile_timeout.write('{} {}\n'.format(currenttime, frame['type']))
        # Write error data
        for frame in error_frames:
            outfile_error.write('{} {}\n'.format(currenttime, frame['error']))
        # Flush output
        outfile_encoder.flush()
        outfile_irig.flush()
        outfile_timeout.flush()
        outfile_error.flush()

        stop_time = time.time()
        if stop_time - start_time >= measurement_time:
            print(f'{measurement_time} sec have passed; stopping UDP collection')
            break

    collector.stop()
    outfile_encoder.close()
    outfile_irig.close()
    outfile_timeout.close()
    outfile_error.close()
    return 0
def doExperiment(self, index):
    collector = Collector()
    path = os.getcwd()
    trainPath = os.path.join(path, 'train')
    testPath = os.path.join(path, 'test')
    if index == 2:
        print('you chose the stop-word experiment')
        print('please wait...')
        name = 'stopWord.txt'
        fileName = os.path.join(path, name)
        f = open(fileName, 'r', encoding='gb18030')
        line = f.read()
        f.close()
        collector.stopWord += line.split('\n')
        resultPath = os.path.join(path, 'stopWord')
        modelName = 'demo-model-exp2.txt'
        resultName = 'demo-result-exp2.txt'
        self.doExpClassify(trainPath, testPath, resultPath, collector, modelName, resultName)
    elif index == 3:
        print('you chose the word-length experiment')
        print('please wait...')
        collector.removeWordLength[0] = 2
        collector.removeWordLength[1] = 9
        resultPath = os.path.join(path, 'wordLength')
        modelName = 'demo-model-exp3.txt'
        resultName = 'demo-result-exp3.txt'
        self.doExpClassify(trainPath, testPath, resultPath, collector, modelName, resultName)
    elif index == 4:
        print('you chose the infrequent-words experiment')
        choose = input('please choose value (0) or percentage (1), default is value: ')
        if choose == '1':
            i = input('please enter the percentage (5-25), default is 5%: ')
            i = int(i)
            if i < 5 or i > 25:
                i = 0.05
            else:
                i = float(i / 100)
        else:
            i = input('please enter the value (1-20), default is 1: ')
            i = int(i)
            if i < 1 or i > 20:
                i = 1
        print('please wait...')
        resultPath = os.path.join(path, 'infrequent')
        curCollector = Collector()
        modelName = 'demo-model-exp4.txt'
        resultName = 'demo-result-exp4.txt'
        curCollector.dataCollector(trainPath)
        curCollector.infrequentProcess(i)
        curCollector.buildModel(resultPath, modelName)
        curCollector.doClassify(testPath, resultPath, resultName)
    elif index == 5:
        print('you chose the delta-change experiment')
        i = input('please choose the value of delta (0.1-1), default is 0.5: ')
        print('please wait...')
        if float(i) < 0.1 or float(i) > 1:
            i = 0.5
        newDelta = float(i)
        resultPath = os.path.join(path, 'delta')
        curCollector = Collector()
        curCollector.delta = newDelta
        modelName = 'demo-model-exp5.txt'
        resultName = 'demo-result-exp5.txt'
        self.doExpClassify(trainPath, testPath, resultPath, curCollector, modelName, resultName)
    else:
        print('you chose the baseline experiment')
        print('please wait...')
        resultPath = os.path.join(path, 'baseLine')
        modelName = 'demo-model-base.txt'
        resultName = 'demo-result-base.txt'
        self.doExpClassify(trainPath, testPath, resultPath, collector, modelName, resultName)
# Each score represents the level of confidence for each of the objects.
detection_scores = inference.get_model_detection_scores()
detection_classes = inference.get_model_detection_classes()
# Number of objects detected.
num_detections = inference.get_model_detected_objects()

camera = PiCamera()
camera.resolution = (IM_WIDTH, IM_HEIGHT)
camera.framerate = 10
rawCapture = PiRGBArray(camera, size=(IM_WIDTH, IM_HEIGHT))
rawCapture.truncate(0)

# Initialize frame rate calculation
collector = Collector()
frame_rate_calc = collector.frame_rate_calc
freq = collector.freq
font = collector.font

# Initialize publisher service.
publisher = Publisher()

for frame1 in camera.capture_continuous(rawCapture, format="bgr", use_video_port=True):
    t1 = cv2.getTickCount()
    # Acquire frame and expand frame dimensions to have shape:
    # [1, None, None, 3]
from datetime import datetime

df = pd.read_csv("dataset.csv")
# Resume where the previous run stopped: precips.txt holds one line per processed row.
temp = open("precips.txt")
skip = len(temp.readlines())
print(skip)

for idx, row in enumerate(df.iterrows()):
    if idx < skip:
        continue
    row = row[1]
    lat = str(row.BEGIN_LAT)
    lon = str(row.BEGIN_LON)
    date = row.BEGIN_DATE
    # Reorder the '/'-separated date fields (swaps the last two parts)
    parts = date.split("/")
    date = parts[0] + "/" + parts[2] + "/" + parts[1]
    c = Collector(lat, lon, date)
    # Re-opened in append mode each iteration so partial progress survives a crash
    file = open("precips.txt", "a")
    data = c.getData()
    if data[0] == 'no' or data[1] == 'no':
        print("API ran out; didn't get row", idx)
        break
    if idx % 50 == 0:
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print(idx, current_time, " || ", date, lat, lon, data)
    file.write(str(row.EVENT_ID) + ";;" + data[0] + ";;" + data[1] + "|||\n")
import sys
from datetime import datetime

# Get utils module
import utils
# Setup environment variables
import Process

process = Process.getInstance(open('trading-data/.env', 'r'))
program = sys.argv[1]

if process.env['MASTER_SWITCH'] == 'on':
    if program == 'collect' and len(sys.argv) > 2:
        target_date = utils.parseISODate(sys.argv[2])
        # Kick off the data collection sequence for the given date
        from Collector import Collector
        collector = Collector(target_date, False)
        collector.start()
    elif program == 'collect' and len(sys.argv) <= 2:
        from Collector import Collector
        collector = Collector(datetime.now(), True)
        collector.start()
    elif program == 'train':
        from Learner import Learner
        learner = Learner(False)
        learner.start()
    elif program == 'predict':
        from Learner import Learner
        learner = Learner(True, False)
        learner.start()
    else:
        print('Invalid parameters given. Shutting down...')
import configparser
import os

from Database.DBManager import DBManager
from Collector import Collector

SOURCE_PATH = os.path.dirname(os.path.abspath(__file__))
CONFIG_FILE_NAME = SOURCE_PATH + "/settings.env"
print(CONFIG_FILE_NAME)

# Starting point
config = configparser.ConfigParser()
config.read(CONFIG_FILE_NAME)
config_raw = configparser.RawConfigParser()
config_raw.read(CONFIG_FILE_NAME)

TIME_FORMAT = config_raw.get("Common", "TIME_FORMAT")
DB_HOST = config.get("Database", "DB_HOST")
DB_DATABASE = config.get("Database", "DB_DATABASE")
DB_USERNAME = config.get("Database", "DB_USERNAME")
DB_PASSWORD = config.get("Database", "DB_PASSWORD")

db_manager = DBManager(DB_HOST, DB_USERNAME, DB_PASSWORD, DB_DATABASE)
Collector.collect(db_manager, TIME_FORMAT)
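An illustrative settings.env matching the keys read above (all values are placeholders). TIME_FORMAT is fetched through the RawConfigParser instance, so literal strftime '%' codes need no interpolation escaping:

[Common]
TIME_FORMAT = %Y-%m-%d %H:%M:%S

[Database]
DB_HOST = localhost
DB_DATABASE = collector_db
DB_USERNAME = collector_user
DB_PASSWORD = change-me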
class dbBuilder:
    def __init__(self, species, download=False, withEns=True):
        self.species = species
        self.dbName = None
        self.data = Collector(self.species)
        self.data.collectAll(download=download, withEns=withEns)
        self.TranscriptNoProteinRec = {}
        self.DomainsSourceDB = 'DB_merged.sqlite'
        self.DomainOrg = DomainOrganizer(download=download)

    def create_tables_db(self, merged=True, dbName=None):
        """Create the tables in the species database, to be filled with UCSC transcript data."""
        if dbName is not None:
            self.dbName = dbName
        elif merged:
            self.dbName = 'DB_merged'
        else:
            self.dbName = 'DB_' + self.species
        print("Creating database: {}...".format(self.dbName))
        with connect(self.dbName + '.sqlite') as con:
            cur = con.cursor()
            cur.executescript('DROP TABLE IF EXISTS Genes;')
            print('Creating the table: Genes')
            cur.execute('''
                CREATE TABLE Genes(
                    gene_GeneID_id TEXT UNIQUE,
                    gene_ensembl_id TEXT UNIQUE,
                    gene_symbol TEXT,
                    synonyms TEXT,
                    chromosome TEXT,
                    strand TEXT,
                    specie TEXT,
                    PRIMARY KEY(gene_GeneID_id, gene_ensembl_id, gene_symbol)
                );''')
            cur.executescript("DROP TABLE IF EXISTS Transcripts;")
            print('Creating the table: Transcripts')
            cur.execute('''
                CREATE TABLE Transcripts(
                    transcript_refseq_id TEXT UNIQUE,
                    transcript_ensembl_id TEXT UNIQUE,
                    tx_start INTEGER,
                    tx_end INTEGER,
                    cds_start INTEGER,
                    cds_end INTEGER,
                    exon_count INTEGER,
                    gene_GeneID_id TEXT,
                    gene_ensembl_id TEXT,
                    protein_refseq_id TEXT,
                    protein_ensembl_id TEXT,
                    PRIMARY KEY (transcript_refseq_id, transcript_ensembl_id),
                    FOREIGN KEY(gene_GeneID_id, gene_ensembl_id)
                        REFERENCES Genes(gene_GeneID_id, gene_ensembl_id),
                    FOREIGN KEY(protein_refseq_id, protein_ensembl_id)
                        REFERENCES Proteins(protein_refseq_id, protein_ensembl_id)
                );''')
            cur.executescript("DROP TABLE IF EXISTS Exons;")
            print('Creating the table: Exons')
            cur.execute('''
                CREATE TABLE Exons(
                    gene_GeneID_id TEXT,
                    gene_ensembl_id TEXT,
                    genomic_start_tx INTEGER,
                    genomic_end_tx INTEGER,
                    PRIMARY KEY (gene_GeneID_id, gene_ensembl_id, genomic_start_tx, genomic_end_tx),
                    FOREIGN KEY(gene_GeneID_id, gene_ensembl_id)
                        REFERENCES Genes(gene_GeneID_id, gene_ensembl_id)
                );''')
            cur.executescript("DROP TABLE IF EXISTS Transcript_Exon;")
            print('Creating the table: Transcript_Exon')
            cur.execute('''
                CREATE TABLE Transcript_Exon(
                    transcript_refseq_id TEXT,
                    transcript_ensembl_id TEXT,
                    order_in_transcript INTEGER,
                    genomic_start_tx INTEGER,
                    genomic_end_tx INTEGER,
                    abs_start_CDS INTEGER,
                    abs_end_CDS INTEGER,
                    PRIMARY KEY(transcript_refseq_id, transcript_ensembl_id, order_in_transcript),
                    FOREIGN KEY(transcript_refseq_id, transcript_ensembl_id)
                        REFERENCES Transcripts(transcript_refseq_id, transcript_ensembl_id),
                    FOREIGN KEY(genomic_start_tx, genomic_end_tx)
                        REFERENCES Exons(genomic_start_tx, genomic_end_tx)
                );''')
            cur.executescript("DROP TABLE IF EXISTS Proteins;")
            print('Creating the table: Proteins')
            cur.execute('''
                CREATE TABLE Proteins(
                    protein_refseq_id TEXT UNIQUE,
                    protein_ensembl_id TEXT UNIQUE,
                    description TEXT,
                    synonyms TEXT,
                    length INTEGER,
                    transcript_refseq_id TEXT,
                    transcript_ensembl_id TEXT,
                    PRIMARY KEY(protein_refseq_id, protein_ensembl_id),
                    FOREIGN KEY(transcript_refseq_id, transcript_ensembl_id)
                        REFERENCES Transcripts(transcript_refseq_id, transcript_ensembl_id)
                );''')
            cur.executescript("DROP TABLE IF EXISTS DomainType;")
            print('Creating the table: DomainType')
            cur.execute('''
                CREATE TABLE DomainType(
                    type_id INTEGER NOT NULL PRIMARY KEY UNIQUE,
                    name TEXT,
                    other_name TEXT,
                    description TEXT,
                    CDD_id TEXT,
                    cdd TEXT,
                    pfam TEXT,
                    smart TEXT,
                    tigr TEXT,
                    interpro TEXT
                );''')
            cur.executescript("DROP TABLE IF EXISTS DomainEvent;")
            print('Creating the table: DomainEvent')
            cur.execute('''
                CREATE TABLE DomainEvent(
                    protein_refseq_id TEXT,
                    protein_ensembl_id TEXT,
                    type_id INTEGER,
                    AA_start INTEGER,
                    AA_end INTEGER,
                    nuc_start INTEGER,
                    nuc_end INTEGER,
                    total_length INTEGER,
                    ext_id TEXT,
                    splice_junction BOOLEAN,
                    complete_exon BOOLEAN,
                    PRIMARY KEY(protein_refseq_id, protein_ensembl_id, type_id, AA_start, total_length),
                    FOREIGN KEY(type_id) REFERENCES DomainType(type_id),
                    FOREIGN KEY(protein_refseq_id, protein_ensembl_id)
                        REFERENCES Proteins(protein_refseq_id, protein_ensembl_id)
                );''')
            cur.executescript("DROP TABLE IF EXISTS SpliceInDomains;")
            print('Creating the table: SpliceInDomains')
            cur.execute("""
                CREATE TABLE SpliceInDomains(
                    transcript_refseq_id TEXT,
                    transcript_ensembl_id TEXT,
                    exon_order_in_transcript INTEGER,
                    type_id INTEGER,
                    total_length INTEGER,
                    domain_nuc_start INTEGER,
                    included_len INTEGER,
                    exon_num_in_domain INTEGER,
                    PRIMARY KEY (transcript_refseq_id, transcript_ensembl_id,
                                 exon_order_in_transcript, type_id, total_length, domain_nuc_start),
                    FOREIGN KEY(transcript_refseq_id, transcript_ensembl_id)
                        REFERENCES Transcripts(transcript_refseq_id, transcript_ensembl_id),
                    FOREIGN KEY(exon_order_in_transcript) REFERENCES Transcript_Exon(order_in_transcript),
                    FOREIGN KEY(type_id) REFERENCES DomainType(type_id),
                    FOREIGN KEY(domain_nuc_start, total_length)
                        REFERENCES DomainEvent(nuc_start, total_length)
                );""")
            if merged:
                cur.executescript("DROP TABLE IF EXISTS Orthology;")
                print('Creating the table: Orthology')
                cur.execute("""
                    CREATE TABLE Orthology(
                        A_ensembl_id TEXT,
                        A_GeneSymb TEXT,
                        A_species TEXT,
                        B_ensembl_id TEXT,
                        B_GeneSymb TEXT,
                        B_species TEXT,
                        PRIMARY KEY (A_ensembl_id, B_ensembl_id),
                        FOREIGN KEY (A_ensembl_id, B_ensembl_id, A_GeneSymb, B_GeneSymb, A_species, B_species)
                            REFERENCES Genes(gene_ensembl_id, gene_ensembl_id, gene_symbol, gene_symbol, specie, specie)
                    );""")
        # ~~~ disconnect database ~~~

    def create_index(self):
        """Create indexes for efficient searches."""
        with connect(self.dbName + '.sqlite') as con:
            cur = con.cursor()
            cur.execute('''CREATE INDEX geneTableIndexBySpecies ON Genes(specie);''')
            cur.execute('''CREATE INDEX transcriptTableIndexByGene ON Transcripts(gene_GeneID_id);''')
            cur.execute('''CREATE INDEX exonsInTranscriptsTableIndexByTranscripts
                           ON Transcript_Exon(transcript_refseq_id);''')
            cur.execute('''CREATE INDEX domainEventsTableIndexByProtein ON DomainEvent(protein_refseq_id);''')
            cur.execute('''CREATE INDEX domainEventsTableIndexByEnsembl ON DomainEvent(protein_ensembl_id);''')
            cur.execute('''CREATE INDEX exonInTranscriptsTableIndexByEnsembl
                           ON Transcript_Exon(transcript_ensembl_id);''')

    def fill_in_db(self, CollectDomainsFromMerged=True, merged=True, dbName=None):
        """Fill the database with the collected data for a single species.
        If the target is the merged database, set merged=True; otherwise a
        species-specific database is used."""
        if dbName is not None:
            self.dbName = dbName
        elif merged:
            self.dbName = 'DB_merged'
        else:
            self.dbName = 'DB_' + self.species
        if CollectDomainsFromMerged:
            # keep domain ids consistent between the merged and single-species db
            self.DomainOrg.collectDatafromDB(self.DomainsSourceDB)
        preDomains = set(self.DomainOrg.allDomains.keys())
        with connect(self.dbName + '.sqlite') as con:
            print("Connected to " + self.dbName + "...")
            print("Filling in the tables...")
            cur = con.cursor()
            geneSet = set()
            uExon = set()
            relevantDomains = set()
            for tID, transcript in self.data.Transcripts.items():
                ensemblkey = tID.startswith("ENS")
                e_counts = len(transcript.exon_starts)
                # insert into the Transcripts table
                if transcript.CDS is None:
                    print("Transcript {} from {} has None in CDS".format(tID, self.species))
                    transcript.CDS = transcript.tx
                values = ((transcript.refseq, transcript.ensembl)
                          + transcript.tx + transcript.CDS
                          + (e_counts, transcript.gene_GeneID, transcript.gene_ensembl,
                             transcript.protein_refseq, transcript.protein_ensembl))
                cur.execute('''INSERT INTO Transcripts
                               (transcript_refseq_id, transcript_ensembl_id, tx_start, tx_end, cds_start,
                                cds_end, exon_count, gene_GeneID_id, gene_ensembl_id,
                                protein_refseq_id, protein_ensembl_id)
                               VALUES(?,?,?,?,?,?,?,?,?,?,?)''', values)
                # insert into the Genes table
                if transcript.gene_GeneID not in geneSet and transcript.gene_ensembl not in geneSet:
                    gene = self.data.Genes.get(
                        transcript.gene_GeneID if transcript.gene_GeneID is not None else transcript.gene_ensembl,
                        self.data.Genes.get(transcript.gene_ensembl, None))
                    if gene is None:
                        raise ValueError("No gene in Genes for transcript {}, {}. GeneID: {}, ensembl gene: {}".format(
                            transcript.refseq, transcript.ensembl,
                            transcript.gene_GeneID, transcript.gene_ensembl))
                    values = (gene.GeneID, gene.ensembl, gene.symbol, gene.synonyms,
                              gene.chromosome, gene.strand, self.species,)
                    cur.execute('''INSERT INTO Genes
                                   (gene_GeneID_id, gene_ensembl_id, gene_symbol, synonyms,
                                    chromosome, strand, specie)
                                   VALUES (?, ?, ?, ?, ?, ?, ?)''', values)
                    geneSet.add(gene.GeneID)
                    geneSet.add(gene.ensembl)
                    geneSet = geneSet - {None}
                start_abs, stop_abs = transcript.exons2abs()
                ex_num = 0
                starts = transcript.exon_starts.copy()
                ends = transcript.exon_ends.copy()
                for iEx in range(e_counts):
                    ex_num += 1
                    # insert into the Transcript_Exon table
                    values = (transcript.refseq, transcript.ensembl, ex_num,
                              starts[iEx], ends[iEx], start_abs[iEx], stop_abs[iEx],)
                    cur.execute('''INSERT INTO Transcript_Exon
                                   (transcript_refseq_id, transcript_ensembl_id, order_in_transcript,
                                    genomic_start_tx, genomic_end_tx, abs_start_CDS, abs_end_CDS)
                                   VALUES (?, ?, ?, ?, ?, ?, ?)''', values)
                    # insert into the Exons table
                    values = (transcript.gene_GeneID, transcript.gene_ensembl, starts[iEx], ends[iEx],)
                    if values not in uExon:
                        uExon.add(values)
                        cur.execute('''INSERT INTO Exons
                                       (gene_GeneID_id, gene_ensembl_id, genomic_start_tx, genomic_end_tx)
                                       VALUES (?, ?, ?, ?)''', values)
                # insert into the Proteins table
                protID = transcript.protein_ensembl if ensemblkey else transcript.protein_refseq
                protein = self.data.Proteins[protID]
                values = (protein.refseq, protein.ensembl, protein.description, protein.length,
                          protein.synonyms, transcript.refseq, transcript.ensembl,)
                cur.execute('''INSERT INTO Proteins
                               (protein_refseq_id, protein_ensembl_id, description, length, synonyms,
                                transcript_refseq_id, transcript_ensembl_id)
                               VALUES (?, ?, ?, ?, ?, ?, ?)''', values)
                splicin = set()
                Domdf = pd.DataFrame(columns=["protein_refseq_id", "protein_ensembl_id", "type_id",
                                              "AA_start", "AA_end", "nuc_start", "nuc_end",
                                              "total_length", "ext_id", "splice_junction", "complete_exon"])
                for reg in self.data.Domains.get(protID, [None]):
                    if reg is None:
                        continue
                    regID = self.DomainOrg.addDomain(reg)
                    if regID is None:
                        continue
                    relevantDomains.add(regID)
                    relation, exon_list, length = reg.domain_exon_relationship(start_abs, stop_abs)
                    total_length = reg.nucEnd - reg.nucStart + 1  # adding one because coordinates are fully closed!
                    splice_junction = 0
                    complete = 0
                    if relation == 'splice_junction':
                        splice_junction = 1
                        for i in range(len(exon_list)):
                            values = (transcript.refseq, transcript.ensembl, exon_list[i],
                                      reg.nucStart, regID, total_length, length[i], i + 1,)
                            if values not in splicin:
                                cur.execute('''INSERT INTO SpliceInDomains
                                               (transcript_refseq_id, transcript_ensembl_id,
                                                exon_order_in_transcript, domain_nuc_start, type_id,
                                                total_length, included_len, exon_num_in_domain)
                                               VALUES (?, ?, ?, ?, ?, ?, ?, ?)''', values)
                                splicin.add(values)
                    elif relation == 'complete_exon':
                        complete = 1
                    # insert into the DomainEvent table (buffered in a per-transcript DataFrame)
                    ldf = Domdf.shape[0]
                    extWithInter = "; ".join([reg.extID, self.DomainOrg.allDomains[regID][-1]]) \
                        if self.DomainOrg.allDomains[regID][-1] is not None else reg.extID
                    values = (protein.refseq, protein.ensembl, regID, reg.aaStart, reg.aaEnd,
                              reg.nucStart, reg.nucEnd, total_length, extWithInter,
                              splice_junction, complete,)
                    Domdf.loc[ldf] = list(values)
                Domdf = Domdf.drop_duplicates()
                Domdf = Domdf.fillna(-1)
                # group by everything except ext_id, joining the external ids
                Domdf = Domdf.groupby(["protein_refseq_id", "protein_ensembl_id", "type_id",
                                       "AA_start", "AA_end", "nuc_start", "nuc_end",
                                       "total_length", "splice_junction", "complete_exon"],
                                      as_index=False, sort=False).agg(lambda col: "; ".join(set(col)))
                Domdf = Domdf.replace(-1, np.nan)
                Domdf.to_sql("DomainEvent", con, if_exists="append", index=False)
            # ~~~ end of loop iterating over transcripts ~~~
            bp = time.time()
            if merged:
                relevantDomains = preDomains.union(relevantDomains)
                print('Recreating the table: DomainType and updating domains')
                cur.executescript("DROP TABLE IF EXISTS DomainType;")
                print('Creating the table: DomainType')
                cur.execute('''
                    CREATE TABLE DomainType(
                        type_id INTEGER NOT NULL PRIMARY KEY UNIQUE,
                        name TEXT,
                        other_name TEXT,
                        description TEXT,
                        CDD_id TEXT,
                        cdd TEXT,
                        pfam TEXT,
                        smart TEXT,
                        tigr TEXT,
                        interpro TEXT
                    );''')
            # insert into the DomainType table
            postDomains = set(self.DomainOrg.allDomains.keys())
            print("from all {} domains in organizer, {} not in relevant domains".format(
                len(postDomains), len(postDomains.difference(relevantDomains))))
            for typeID in relevantDomains:
                if typeID in self.DomainOrg.allDomains.keys():
                    values = (typeID,) + self.DomainOrg.allDomains[typeID]
                    cur.execute('''INSERT INTO DomainType
                                   (type_id, name, other_name, description, CDD_id, cdd,
                                    pfam, smart, tigr, interpro)
                                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', values)
            print("#### Filling in domain type table: %s seconds" % (time.time() - bp))
            con.commit()
        # ~~~ disconnect database ~~~

    def AddOrthology(self, orthologsDict):
        """Add the orthology data to the database, only for the genes included
        in the database. Changes the database in place with no returned output.

        @param orthologsDict: created by the OrthologsBuilder module, called by the main script.
        @return: None
        """
        # Column names use capital "_Species" to match the str.replace calls below
        # (the original declared lowercase 'A_species', which never matched).
        MainOrtho = pd.DataFrame(columns=['A_ensembl_id', 'A_GeneSymb', 'A_Species',
                                          'B_ensembl_id', 'B_GeneSymb', 'B_Species'])
        db_data = dict()
        orthology_species = set([spec for x in orthologsDict.keys() for spec in x])
        with connect(self.dbName + '.sqlite') as con:
            cur = con.cursor()
            schema = cur.execute("PRAGMA table_info('Orthology')").fetchall()
            for spec in orthology_species:
                db_data[spec] = pd.read_sql(
                    "SELECT gene_ensembl_id,gene_symbol,specie FROM Genes WHERE specie='{}'".format(spec), con)
            print("collecting orthology data for:")
            for couple, ortho in orthologsDict.items():
                print("\t{} and {}".format(couple[0], couple[1]))
                merged_df = None
                n = 0
                for spec in couple:
                    db_data[spec]['gene_symbol'] = db_data[spec]['gene_symbol'].str.upper()
                    db_data[spec].columns = db_data[spec].columns.str.replace('gene_ensembl_id', spec + "_ID")
                    if n == 0:
                        merged_df = pd.merge(db_data[spec], ortho)
                    else:
                        merged_df = pd.merge(db_data[spec], merged_df)
                    label = 'A' if n == 0 else 'B'
                    merged_df.columns = merged_df.columns.str.replace("specie", label + "_Species")
                    merged_df.columns = merged_df.columns.str.replace("gene_symbol", label + "_GeneSymb")
                    merged_df.columns = merged_df.columns.str.replace(spec + "_ID", label + "_ensembl_id")
                    merged_df = merged_df.drop(spec + "_name", axis=1)
                    n += 1
                MainOrtho = MainOrtho.append(merged_df, sort=False)
            MainOrtho = MainOrtho.drop_duplicates()
            MainOrtho = MainOrtho.groupby(["A_ensembl_id", "B_ensembl_id"],
                                          as_index=False, sort=False).agg(lambda col: ', '.join(set(col)))
            print("Filling in Orthology table...")
            try:
                MainOrtho.to_sql("Orthology", con, if_exists="replace", schema=schema, index=False)
            except Exception as err:
                print(err)
                MainOrtho.to_csv("OrthologyTable.Failed.csv")
            print("Filling Orthology table complete!")
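A hypothetical end-to-end build sequence for the dbBuilder class above (the method names come from the class itself; the species string and flag values are placeholders):

builder = dbBuilder('Mus_musculus', download=True, withEns=True)
builder.create_tables_db(merged=False)                             # create the schema
builder.fill_in_db(CollectDomainsFromMerged=False, merged=False)   # populate the tables
builder.create_index()                                             # add the search indexes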
# Configure logging
app_logger = logging.getLogger(__name__)
app_logger.setLevel(logging.INFO)
es_logger = logging.getLogger('elasticsearch.trace')
es_logger.setLevel(logging.WARNING)
handler = logging.FileHandler(
    filename='burner_%s.log' % datetime.datetime.now().strftime('%Y%m%d'), mode='a')
handler.setLevel(logging.INFO)
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s", "%Y-%m-%d %H:%M"))
app_logger.addHandler(handler)
es_logger.addHandler(handler)
app_logger.info('Started')

# Read configuration file
config = ConfigParser.RawConfigParser()
config.read('config.cfg')

collector = Collector.Collector(config, app_logger)
updater = Updater(config, app_logger)
updater.store_users_expenses(collector.get_users_expenses())
app_logger.info('Completed\n')
class Simulation(object):
    def checkIfPositionIsEmpty(self, position):
        for i in self.mapElements:
            if i.position == position:
                return False
        return True

    def __init__(self, binsAmount):
        self.gridWidth = 20
        self.gridHeight = 9
        self.fieldSize = 64
        self.window = Tk()
        self.canvas = Canvas(self.window,
                             width=self.fieldSize * self.gridWidth,
                             height=self.fieldSize * self.gridHeight)
        self.binsAmount = binsAmount
        self.window.title("Simulation")
        self.collector = Collector(1, 1, 1)
        self.positionsToVisit = []
        self.mapElements = []
        self.addDumps()
        self.addRoads()
        self.addBins()
        self.addGrass()
        self.MovementLogic = MovementLogic(self.mapElements, self.gridWidth, self.gridHeight)
        self.classifier = GarbageClassifier("learningExamples.txt")

    def addDumps(self):
        types = ['plastic', 'paper', 'glass', 'other']
        n = 0
        for j in types:
            new = Dump(n, 0, j)
            n = n + 1
            self.mapElements.append(new)

    def addRoad(self, position1, position2):
        if position1[0] == position2[0]:
            for i in range(position1[1], position2[1] + 1):
                if self.checkIfPositionIsEmpty([position1[0], i]):
                    element = Road(position1[0], i)
                    self.mapElements.append(element)
        elif position1[1] == position2[1]:
            for i in range(position1[0], position2[0] + 1):
                if self.checkIfPositionIsEmpty([i, position1[1]]):
                    element = Road(i, position1[1])
                    self.mapElements.append(element)

    def addRoads(self):
        self.addRoad([0, 1], [self.gridWidth, 1])
        self.addRoad([0, 4], [self.gridWidth, 4])
        self.addRoad([0, 7], [self.gridWidth, 7])
        r = randint(1, 6)
        for i in range(0, r):
            s = randint(1, self.gridWidth - 2)
            self.addRoad([s, 1], [s, self.gridHeight - 2])

    def addBins(self):
        for i in range(0, self.binsAmount):
            rightPosition = False
            while not rightPosition:
                x = randint(0, self.gridWidth - 1)
                y = randint(0, self.gridHeight - 1)
                if self.checkIfPositionIsEmpty([x, y]):
                    rightPosition = True
                    element = Bin(x, y)
                    self.positionsToVisit.append([x, y])
                    self.mapElements.append(element)

    def addGrass(self):
        for i in range(0, self.gridWidth):
            for j in range(0, self.gridHeight):
                if self.checkIfPositionIsEmpty([i, j]):
                    element = Grass(i, j)
                    self.mapElements.append(element)

    def display(self):
        for i in self.mapElements:
            x = i.position[0]
            y = i.position[1]
            self.canvas.create_image(x * self.fieldSize, y * self.fieldSize, image=i.image, anchor=NW)
        x = self.collector.state.position[0]
        y = self.collector.state.position[1]
        self.canvas.create_image(x * self.fieldSize, y * self.fieldSize,
                                 image=self.collector.image, anchor=NW)
        self.canvas.pack()

    def update(self):
        self.display()
        self.window.update_idletasks()
        self.window.update()
        time.sleep(0.5)

    def classify(self):
        for i in range(0, 5):
            r = randint(1, 40)
            name = "./photos/test/test" + str(r) + ".jpg"
            im = ImageExample(name)
            image = ImageTk.PhotoImage(Image.open(name))
            result = self.classifier.test(im.getString())
            self.canvas.create_image(350, 100, image=image, anchor=NW)
            self.canvas.pack()
            self.window.update_idletasks()
            self.window.update()
            time.sleep(0.5)
            self.canvas.create_text(420, 150, fill="black", font="Times 20", text=result)
            self.canvas.pack()
            self.window.update_idletasks()
            self.window.update()
            time.sleep(2)

    def predictDigits(self):
        sess = tf.Session()
        saver = tf.train.import_meta_graph('./src/model/my-model.meta')
        saver.restore(sess, tf.train.latest_checkpoint('./model'))
        print("Model loaded.")
        graph = tf.get_default_graph()
        output_layer = graph.get_tensor_by_name("output:0")
        X = graph.get_tensor_by_name("X:0")
        r = randint(0, 9)
        img = np.invert(Image.open("../test_digits/house_test_" + str(r) + ".png"))
        prediction = sess.run(tf.argmax(output_layer, 1), feed_dict={X: [img]})
        print("Prediction for the test image:", np.squeeze(prediction))

    def start(self):
        for p in self.positionsToVisit:
            for zz in self.mapElements:
                if zz.position == p:
                    zz.searching = True
                    zz.updateImage()
            self.update()
            actions = self.MovementLogic.getActions(self.collector.state, p)
            if actions is not None:
                for i in actions:
                    print(i)
                    self.update()
                    self.collector.doAction(i)
            self.update()
            self.predictDigits()
            self.classify()
            for zz in self.mapElements:
                if zz.position == p:
                    zz.searching = False
                    zz.updateImage()
def data():
    session = boto3.Session(region_name='us-west-2')
    collector = Collector(session)
    headers = {'Content-Type': 'application/json'}
    return (collector.collect(), 200, headers)
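A sketch of wiring data() up as an HTTP endpoint, assuming Flask (whose view functions may return a (body, status, headers) tuple, matching the return value above); the '/data' route is hypothetical, and collector.collect() is assumed to return a JSON string:

from flask import Flask

app = Flask(__name__)
app.add_url_rule('/data', view_func=data)

if __name__ == '__main__':
    app.run()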
# Thread management
from queue import Queue
from collections import deque
import time

from Collector import Collector
from Detector import Detector
from Processor import Processor

# Parameters
maxqueuelen = 200

if __name__ == "__main__":
    # Queue for images from the camera
    queue_raw = Queue(maxsize=maxqueuelen)
    # Queue for boat candidates & frame
    queue_detectors = Queue(maxsize=maxqueuelen)

    collector = Collector(queue_raw)
    detector = Detector(queue_raw, queue_detectors)
    processor = Processor(queue_detectors)

    collector.start()
    detector.start()
    processor.start()

    while True:
        print("queue_raw len: {} queue_detectors len: {}".format(
            queue_raw.qsize(), queue_detectors.qsize()))
        time.sleep(60)
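The Collector, Detector, and Processor stages above each expose start(), suggesting threads; a minimal sketch of what the producer stage could look like (hypothetical, since the real class is not shown here, and read_frame stands in for the actual camera read):

import threading

class Collector(threading.Thread):
    """Producer stage: pushes raw camera frames into queue_raw."""

    def __init__(self, queue_raw):
        super().__init__(daemon=True)
        self.queue_raw = queue_raw

    def run(self):
        while True:
            frame = read_frame()       # hypothetical camera read
            self.queue_raw.put(frame)  # blocks while the queue is full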
def run(samples, channel, use, train, short, preprocess_chain=[]):
    # NB: this module targets Python 2 (print statements, cPickle).
    if use == "xgb":
        from XGBModel import XGBObject as modelObject
        parameters = "conf/parameters_xgb.json"
    if use == "keras":
        from KerasModel import KerasObject as modelObject
        parameters = "conf/parameters_keras.json"

    read = Reader(channel=channel, config_file=samples, folds=2)
    target_names = read.config["target_names"]
    variables = read.config["variables"]

    if not os.path.exists("models"):
        os.mkdir("models")
    modelname = "models/{0}.{1}".format(channel, use)

    scaler = None
    if train:
        print "Training new model"
        print "Loading Training set"
        trainSet = read.getSamplesForTraining()
        print "Fit Scaler to training set...",
        scaler = trainScaler(trainSet, variables)
        print " done. Dumping for later."
        with open("models/StandardScaler.{0}.pkl".format(channel), 'wb') as FSO:
            cPickle.dump(scaler, FSO, 2)
        trainSet = applyScaler(scaler, trainSet, variables)
        model = modelObject(parameter_file=parameters, variables=variables, target_names=target_names)
        model.train(trainSet)
        model.save(modelname)
    else:
        if os.path.exists("models/StandardScaler.{0}.pkl".format(channel)):
            print "Loading Scaler"
            with open("models/StandardScaler.{0}.pkl".format(channel), "rb") as FSO:
                scaler = cPickle.load(FSO)
        print "Loading model and predicting."
        model = modelObject(filename=modelname)
        where = ""
        coll = Collector(channel=channel, var_name="pred_prob", target_names=target_names,
                         path=use, recreate=True, rebin=False)

        print "Predicting simulation"
        for sample, sampleName in read.get(what="nominal"):
            pred = model.predict(applyScaler(scaler, sample, variables), where)
            coll.addPrediction(pred, sample, sampleName)

        print "Adding looser samples to predictions"
        for sample, sampleName in read.get(what="more"):
            pred = model.predict(applyScaler(scaler, sample, variables), where)
            coll.addPrediction(pred, sample, sampleName)

        print "Predicting data"
        for sample, sampleName in read.get(what="data"):
            pred = model.predict(applyScaler(scaler, sample, variables), where)
            coll.addPrediction(pred, sample, sampleName)

        if not short:
            print "Predicting TES shapes"
            for sample, sampleName in read.get(what="tes"):
                pred = model.predict(applyScaler(scaler, sample, variables), where)
                coll.addPrediction(pred, sample, sampleName)
            print "Predicting JES shapes"
            for sample, sampleName in read.get(what="jec"):
                pred = model.predict(applyScaler(scaler, sample, variables), where)
                coll.addPrediction(pred, sample, sampleName)

        coll.createDC(writeAll=True)
        plot = Plotter(channel=channel, naming=read.processes, path=use)
        plot.makePlots()
class dbBuilder:
    def __init__(self, species, download=False, withEns=True):
        self.species = species
        self.dbName = None
        self.data = Collector(self.species)
        self.data.collectAll(download=download, withEns=withEns)
        self.TranscriptNoProteinRec = {}
        self.DomainsSourceDB = 'DB_merged.sqlite'
        self.DomainOrg = DomainOrganizer()

    def create_tables_db(self, merged=True, dbName=None):
        """Create the tables in the species database, to be filled with UCSC transcript data."""
        if dbName is not None:
            self.dbName = dbName
        elif merged:
            self.dbName = 'DB_merged'
        else:
            self.dbName = 'DB_' + self.species
        print("Creating database: {}...".format(self.dbName))
        with connect(self.dbName + '.sqlite') as con:
            cur = con.cursor()
            cur.executescript('DROP TABLE IF EXISTS Genes;')
            print('Creating the table: Genes')
            cur.execute('''
                CREATE TABLE Genes(
                    gene_GeneID_id TEXT,
                    gene_ensembl_id TEXT,
                    gene_symbol TEXT,
                    synonyms TEXT,
                    chromosome TEXT,
                    strand TEXT,
                    specie TEXT,
                    PRIMARY KEY(gene_GeneID_id, gene_ensembl_id, gene_symbol)
                );''')
            cur.executescript("DROP TABLE IF EXISTS Transcripts;")
            print('Creating the table: Transcripts')
            cur.execute('''
                CREATE TABLE Transcripts(
                    transcript_refseq_id TEXT,
                    transcript_ensembl_id TEXT,
                    tx_start INTEGER,
                    tx_end INTEGER,
                    cds_start INTEGER,
                    cds_end INTEGER,
                    exon_count INTEGER,
                    gene_GeneID_id TEXT,
                    gene_ensembl_id TEXT,
                    protein_refseq_id TEXT,
                    protein_ensembl_id TEXT,
                    PRIMARY KEY (transcript_refseq_id, transcript_ensembl_id),
                    FOREIGN KEY(gene_GeneID_id, gene_ensembl_id)
                        REFERENCES Genes(gene_GeneID_id, gene_ensembl_id),
                    FOREIGN KEY(protein_refseq_id, protein_ensembl_id)
                        REFERENCES Proteins(protein_refseq_id, protein_ensembl_id)
                );''')
            cur.executescript("DROP TABLE IF EXISTS Exons;")
            print('Creating the table: Exons')
            cur.execute('''
                CREATE TABLE Exons(
                    gene_GeneID_id TEXT,
                    gene_ensembl_id TEXT,
                    genomic_start_tx INTEGER,
                    genomic_end_tx INTEGER,
                    PRIMARY KEY (gene_GeneID_id, gene_ensembl_id, genomic_start_tx, genomic_end_tx),
                    FOREIGN KEY(gene_GeneID_id, gene_ensembl_id)
                        REFERENCES Genes(gene_GeneID_id, gene_ensembl_id)
                );''')
            cur.executescript("DROP TABLE IF EXISTS Transcript_Exon;")
            print('Creating the table: Transcript_Exon')
            cur.execute('''
                CREATE TABLE Transcript_Exon(
                    transcript_refseq_id TEXT,
                    transcript_ensembl_id TEXT,
                    order_in_transcript INTEGER,
                    genomic_start_tx INTEGER,
                    genomic_end_tx INTEGER,
                    abs_start_CDS INTEGER,
                    abs_end_CDS INTEGER,
                    PRIMARY KEY(transcript_refseq_id, transcript_ensembl_id, order_in_transcript),
                    FOREIGN KEY(transcript_refseq_id, transcript_ensembl_id)
                        REFERENCES Transcripts(transcript_refseq_id, transcript_ensembl_id),
                    FOREIGN KEY(genomic_start_tx, genomic_end_tx)
                        REFERENCES Exons(genomic_start_tx, genomic_end_tx)
                );''')
            cur.executescript("DROP TABLE IF EXISTS Proteins;")
            print('Creating the table: Proteins')
            cur.execute('''
                CREATE TABLE Proteins(
                    protein_refseq_id TEXT,
                    protein_ensembl_id TEXT,
                    description TEXT,
                    synonyms TEXT,
                    length INTEGER,
                    transcript_refseq_id TEXT,
                    transcript_ensembl_id TEXT,
                    PRIMARY KEY(protein_refseq_id, protein_ensembl_id),
                    FOREIGN KEY(transcript_refseq_id, transcript_ensembl_id)
                        REFERENCES Transcripts(transcript_refseq_id, transcript_ensembl_id)
                );''')
            cur.executescript("DROP TABLE IF EXISTS DomainType;")
            print('Creating the table: DomainType')
            cur.execute('''
                CREATE TABLE DomainType(
                    type_id INTEGER NOT NULL PRIMARY KEY UNIQUE,
                    name TEXT,
                    other_name TEXT,
                    description TEXT,
                    CDD_id TEXT,
                    cdd TEXT,
                    pfam TEXT,
                    smart TEXT,
                    tigr TEXT,
                    interpro TEXT
                );''')
            cur.executescript("DROP TABLE IF EXISTS DomainEvent;")
            print('Creating the table: DomainEvent')
            cur.execute('''
                CREATE TABLE DomainEvent(
                    protein_refseq_id TEXT,
                    protein_ensembl_id TEXT,
                    type_id INTEGER,
                    AA_start INTEGER,
                    AA_end INTEGER,
                    nuc_start INTEGER,
                    nuc_end INTEGER,
                    total_length INTEGER,
                    ext_id TEXT,
                    splice_junction BOOLEAN,
                    complete_exon BOOLEAN,
                    PRIMARY KEY(protein_refseq_id, protein_ensembl_id, type_id, AA_start, total_length, ext_id),
                    FOREIGN KEY(type_id) REFERENCES DomainType(type_id),
                    FOREIGN KEY(protein_refseq_id, protein_ensembl_id)
                        REFERENCES Proteins(protein_refseq_id, protein_ensembl_id)
                );''')
            cur.executescript("DROP TABLE IF EXISTS SpliceInDomains;")
            print('Creating the table: SpliceInDomains')
            cur.execute("""
                CREATE TABLE SpliceInDomains(
                    transcript_refseq_id TEXT,
                    transcript_ensembl_id TEXT,
                    exon_order_in_transcript INTEGER,
                    type_id INTEGER,
                    total_length INTEGER,
                    domain_nuc_start INTEGER,
                    included_len INTEGER,
                    exon_num_in_domain INTEGER,
                    PRIMARY KEY (transcript_refseq_id, transcript_ensembl_id,
                                 exon_order_in_transcript, type_id, total_length, domain_nuc_start),
                    FOREIGN KEY(transcript_refseq_id, transcript_ensembl_id)
                        REFERENCES Transcripts(transcript_refseq_id, transcript_ensembl_id),
                    FOREIGN KEY(exon_order_in_transcript) REFERENCES Transcript_Exon(order_in_transcript),
                    FOREIGN KEY(type_id) REFERENCES DomainType(type_id),
                    FOREIGN KEY(domain_nuc_start, total_length)
                        REFERENCES DomainEvent(nuc_start, total_length)
                );""")
            if merged:
                cur.executescript("DROP TABLE IF EXISTS Orthology;")
                print('Creating the table: Orthology')
                cur.execute("""
                    CREATE TABLE Orthology(
                        A_ensembl_id TEXT,
                        A_GeneSymb TEXT,
                        A_Species TEXT,
                        B_ensembl_id TEXT,
                        B_GeneSymb TEXT,
                        B_Species TEXT,
                        PRIMARY KEY (A_ensembl_id, B_ensembl_id),
                        FOREIGN KEY (A_ensembl_id, B_ensembl_id, A_GeneSymb, B_GeneSymb, A_Species, B_Species)
                            REFERENCES Genes(gene_ensembl_id, gene_ensembl_id, gene_symbol, gene_symbol, specie, specie)
                    );""")

    def create_index(self):
        """Create indexes for efficient searches."""
        with connect(self.dbName + '.sqlite') as con:
            cur = con.cursor()
            cur.execute('''CREATE INDEX geneTableIndexBySpecies ON Genes(specie);''')
            cur.execute('''CREATE INDEX transcriptTableIndexByGene ON Transcripts(gene_GeneID_id);''')
            cur.execute('''CREATE INDEX exonsInTranscriptsTableIndexByTranscripts
                           ON Transcript_Exon(transcript_refseq_id);''')
            cur.execute('''CREATE INDEX domainEventsTableIndexByProtein ON DomainEvent(protein_refseq_id);''')
            cur.execute('''CREATE INDEX domainEventsTableIndexByEnsembl ON DomainEvent(protein_ensembl_id);''')
            cur.execute('''CREATE INDEX exonInTranscriptsTableIndexByEnsembl
                           ON Transcript_Exon(transcript_ensembl_id);''')

    def fill_in_db(self, CollectDomainsFromMerged=True, merged=True, dbName=None):
        """Fill the database for a single species. To combine more than one
        species, add each species to the merged database."""
        if dbName is not None:
            self.dbName = dbName
        elif merged:
            self.dbName = 'DB_merged'
        else:
            self.dbName = 'DB_' + self.species
        if CollectDomainsFromMerged:
            self.DomainOrg.collectDatafromDB(self.DomainsSourceDB)

        # Buffer all rows in DataFrames and write them with to_sql at the end.
        Transcripts = pd.DataFrame(columns=["transcript_refseq_id", "transcript_ensembl_id",
                                            "tx_start", "tx_end", "cds_start", "cds_end",
                                            "exon_count", "gene_GeneID_id", "gene_ensembl_id",
                                            "protein_refseq_id", "protein_ensembl_id"])
        Genes = pd.DataFrame(columns=["gene_GeneID_id", "gene_ensembl_id", "gene_symbol",
                                      "synonyms", "chromosome", "strand", "specie"])
        Transcript_Exon = pd.DataFrame(columns=["transcript_refseq_id", "transcript_ensembl_id",
                                                "order_in_transcript", "genomic_start_tx",
                                                "genomic_end_tx", "abs_start_CDS", "abs_end_CDS"])
        Exons = pd.DataFrame(columns=["gene_GeneID_id", "gene_ensembl_id",
                                      "genomic_start_tx", "genomic_end_tx"])
        Proteins = pd.DataFrame(columns=["protein_refseq_id", "protein_ensembl_id", "description",
                                         "length", "synonyms", "transcript_refseq_id",
                                         "transcript_ensembl_id"])
        SpliceInDomains = pd.DataFrame(columns=["transcript_refseq_id", "transcript_ensembl_id",
                                                "exon_order_in_transcript", "domain_nuc_start",
                                                "type_id", "total_length", "included_len",
                                                "exon_num_in_domain"])
        DomainEvent = pd.DataFrame(columns=["protein_refseq_id", "protein_ensembl_id", "type_id",
                                            "AA_start", "AA_end", "nuc_start", "nuc_end",
                                            "total_length", "ext_id", "splice_junction",
                                            "complete_exon"])
        DomainType = pd.DataFrame(columns=["type_id", "name", "other_name", "description",
                                           "CDD_id", "cdd", "pfam", "smart", "tigr", "interpro"])

        print("Filling in the tables...")
        geneSet = set()
        uExon = set()
        domeve = set()
        relevantDomains = set()
        for tID, transcript in self.data.Transcripts.items():
            ensemblkey = tID.startswith("ENS")
            e_counts = len(transcript.exon_starts)
            # insert into the Transcripts frame
            if transcript.CDS is None:
                transcript.CDS = transcript.tx
            values = ((transcript.refseq, transcript.ensembl)
                      + transcript.tx + transcript.CDS
                      + (e_counts, transcript.gene_GeneID, transcript.gene_ensembl,
                         transcript.protein_refseq, transcript.protein_ensembl))
            idx = len(Transcripts)
            Transcripts.loc[idx] = list(values)
            # insert into the Genes frame
            if transcript.gene_GeneID not in geneSet and transcript.gene_ensembl not in geneSet:
                if ensemblkey:
                    gene = self.data.Genes[transcript.gene_ensembl]
                else:
                    gene = self.data.Genes[transcript.gene_GeneID]
                values = (gene.GeneID, gene.ensembl, gene.symbol, gene.synonyms,
                          gene.chromosome, gene.strand, self.species,)
                idx = len(Genes)
                Genes.loc[idx] = list(values)
                geneSet.add(gene.GeneID)
                geneSet.add(gene.ensembl)
                geneSet = geneSet - {None}
            start_abs, stop_abs = transcript.exons2abs()
            ex_num = 0
            starts = transcript.exon_starts.copy()
            ends = transcript.exon_ends.copy()
            for iEx in range(e_counts):
                ex_num += 1
                # insert into the Transcript_Exon frame
                values = (transcript.refseq, transcript.ensembl, ex_num,
                          starts[iEx], ends[iEx], start_abs[iEx], stop_abs[iEx],)
                idx = len(Transcript_Exon)
                Transcript_Exon.loc[idx] = list(values)
                # insert into the Exons frame
                values = (transcript.gene_GeneID, transcript.gene_ensembl, starts[iEx], ends[iEx],)
                if values not in uExon:
                    uExon.add(values)
                    idx = len(Exons)
                    Exons.loc[idx] = list(values)
            # insert into the Proteins frame
            protID = transcript.protein_ensembl if ensemblkey else transcript.protein_refseq
            protein = self.data.Proteins[protID]
            values = (protein.refseq, protein.ensembl, protein.description, protein.length,
                      protein.synonyms, transcript.refseq, transcript.ensembl,)
            idx = len(Proteins)
            Proteins.loc[idx] = list(values)
            splicin = set()
            for reg in self.data.Domains.get(protID, [None]):
                if reg is None:
                    continue
                regID = self.DomainOrg.addDomain(reg)
                if regID is None:
                    continue
                relevantDomains.add(regID)
                relation, exon_list, length = reg.domain_exon_relationship(start_abs, stop_abs)
                total_length = reg.nucEnd - reg.nucStart + 1  # adding one because coordinates are fully closed!
                splice_junction = 0
                complete = 0
                if relation == 'splice_junction':
                    splice_junction = 1
                    for i in range(len(exon_list)):
                        values = (transcript.refseq, transcript.ensembl, exon_list[i],
                                  reg.nucStart, regID, total_length, length[i], i + 1,)
                        if values not in splicin:
                            idx = len(SpliceInDomains)
                            SpliceInDomains.loc[idx] = list(values)
                            splicin.add(values)
                elif relation == 'complete_exon':
                    complete = 1
                # insert into the DomainEvent frame
                values = (protein.refseq, protein.ensembl, regID, reg.aaStart, reg.aaEnd,
                          reg.nucStart, reg.nucEnd, total_length, reg.extID,
                          splice_junction, complete,)
                if values not in domeve:
                    idx = len(DomainEvent)
                    DomainEvent.loc[idx] = list(values)
                    domeve.add(values)

        bp = time.time()
        if merged:
            relevantDomains = set(self.DomainOrg.allDomains.keys())
        # insert into the DomainType frame
        for typeID in relevantDomains:
            if typeID in self.DomainOrg.allDomains.keys():
                values = (typeID,) + self.DomainOrg.allDomains[typeID]
                idx = len(DomainType)
                DomainType.loc[idx] = list(values)
        print("#### Filling in domain type table: %s seconds" % (time.time() - bp))

        with connect(self.dbName + '.sqlite') as con:
            Transcripts.to_sql("Transcripts", con, if_exists="append", index=False)
            Genes.to_sql("Genes", con, if_exists="append", index=False)
            Proteins.to_sql("Proteins", con, if_exists="append", index=False)
            Transcript_Exon.to_sql("Transcript_Exon", con, if_exists="append", index=False)
            Exons.to_sql("Exons", con, if_exists="append", index=False)
            SpliceInDomains.to_sql("SpliceInDomains", con, if_exists="append", index=False)
            DomainEvent.to_sql("DomainEvent", con, if_exists="append", index=False)
            if merged:
                cur = con.cursor()  # bug fix: the original used `cur` here without creating a cursor
                cur.executescript("DROP TABLE IF EXISTS Orthology;")
                print('Creating the table: Orthology')
                cur.execute("""
                    CREATE TABLE Orthology(
                        A_ensembl_id TEXT,
                        A_GeneSymb TEXT,
                        A_Species TEXT,
                        B_ensembl_id TEXT,
                        B_GeneSymb TEXT,
                        B_Species TEXT,
                        PRIMARY KEY (A_ensembl_id, B_ensembl_id),
                        FOREIGN KEY (A_ensembl_id, B_ensembl_id, A_GeneSymb, B_GeneSymb, A_Species, B_Species)
                            REFERENCES Genes(gene_ensembl_id, gene_ensembl_id, gene_symbol, gene_symbol, specie, specie)
                    );""")
            DomainType.to_sql("DomainType", con, if_exists="append", index=False)

    def AddOrthology(self, orthologsDict):
        MainOrtho = pd.DataFrame(columns=['A_ensembl_id', 'A_GeneSymb', 'A_Species',
                                          'B_ensembl_id', 'B_GeneSymb', 'B_Species'])
        db_data = dict()
        species = [spec for x in orthologsDict.keys() for spec in x]
        with connect(self.dbName + '.sqlite') as con:
            for spec in species:
                db_data[spec] = pd.read_sql(
                    "SELECT gene_ensembl_id,gene_symbol,specie FROM Genes WHERE specie='{}'".format(spec), con)
        print("collecting orthology data for:")
        for couple, ortho in orthologsDict.items():
            print("\t{} and {}".format(couple[0], couple[1]))
            merged_df = None
            n = 0
            for spec in couple:
                db_data[spec]['gene_symbol'] = db_data[spec]['gene_symbol'].str.upper()
                db_data[spec].columns = db_data[spec].columns.str.replace('gene_ensembl_id', spec + "_ID")
                if n == 0:
                    merged_df = pd.merge(db_data[spec], ortho)
                else:
                    merged_df = pd.merge(db_data[spec], merged_df)
                label = 'A' if n == 0 else 'B'
                merged_df.columns = merged_df.columns.str.replace("specie", label + "_Species")
                merged_df.columns = merged_df.columns.str.replace("gene_symbol", label + "_GeneSymb")
                merged_df.columns = merged_df.columns.str.replace(spec + "_ID", label + "_ensembl_id")
                merged_df = merged_df.drop(spec + "_name", axis=1)
                n += 1
            MainOrtho = MainOrtho.append(merged_df, sort=False)
        print("Filling in Orthology table...")
        with connect(self.dbName + '.sqlite') as con:
            MainOrtho.to_sql("Orthology", con, if_exists="append", index=False)
        print("Filling Orthology table complete!")
from tkinter import *
import os
from tkinter.filedialog import *
from tkinter import messagebox
from threading import Thread
from sys import path as systemPath

if __name__ == "__main__":
    nope = False
    systemPath.insert(1, "src/Scripts/")

    from Collector import Collector
    collector = Collector(systemPath)

    from Loader import Loader
    __loader = Loader()

    tk = Tk()
    tk.withdraw()
    tk.overrideredirect(True)
    tk.resizable(False, False)
    tk.geometry("%dx%d+%d+%d" % (1, 1, 1, 1))
    __loader.tk = tk
    __loader.collector = collector

    from Monitor import Monitor
    __screenSize = Monitor().get_screensize()
    __loader.screenSize = __screenSize