Example #1
0
def encode(prog_ast,encoding,bq=None):
""" Create an optimized binary encoding of the input program in a BitQueue. """
    if bq is None: bq=bitqueue.BitQueue()
    
    code_table = ddict(int)
    
    # comcounts maps each encoding to per-arity dictionaries of function counts (nullary, unary, binary).
    enccounts0,enccounts1,enccounts2 = comcounts[encoding]

    # Next, group functions by rank within their arity, across signatures of the same arity.
    # Map each "group name" to the combined counts of its members, and map each signature to
    # its group name. From this combined list, build a new Huffman tree.
    
    #handle nullary functions first (they go straight in the table because they take no arguments and are already in the right format)
    code_table.update(enccounts0)
    
    #now unary functions
    #enccounts1 maps argumenttypestr to dictionaries which map function names to counts.
    unarygroupmap = ddict(list)  # a function may get one group index per argument-type signature
    for astr,funcmap in enccounts1.items():
        for i,func in enumerate(sorted(list(funcmap),key=lambda x: funcmap[x])):
            code_table["unarygroup"+str(i)]+=funcmap[func]
            unarygroupmap[func].append(i)
    
    #now binary functions
    binarygroupmap = ddict(list)
    for astr,funcmap in enccounts2.items():
        for i,func in enumerate(sorted(list(funcmap),key=lambda x: funcmap[x])):
            code_table["binarygroup"+str(i)]+=funcmap[func]
            binarygroupmap[func].append(i)
    
    h = HuffmanTree(code_table)
            
    #now that we have the huffman tree built, we need to find the ideal positions of everything in the output program
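The HuffmanTree class used here is not shown in this excerpt; purely as an illustrative sketch (an assumed, simplified interface, not the project's implementation), prefix codes could be derived from a symbol-to-count table like code_table as follows:

import heapq

def huffman_codes(counts):
    # counts: dict mapping symbol -> occurrence count (like code_table above)
    # each heap entry is (total_count, tie_breaker, {symbol: code_so_far})
    if not counts:
        return {}
    heap = [(c, i, {sym: ''}) for i, (sym, c) in enumerate(counts.items())]
    heapq.heapify(heap)
    if len(heap) == 1:
        return {sym: '0' for sym in counts}
    tie = len(heap)
    while len(heap) > 1:
        c1, _, codes1 = heapq.heappop(heap)
        c2, _, codes2 = heapq.heappop(heap)
        # merge the two least frequent groups, extending their codes by one bit
        merged = {s: '0' + code for s, code in codes1.items()}
        merged.update({s: '1' + code for s, code in codes2.items()})
        heapq.heappush(heap, (c1 + c2, tie, merged))
        tie += 1
    return heap[0][2]

# e.g. huffman_codes({'a': 5, 'b': 2, 'c': 1}) assigns 'a' the shortest code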
    
Example #2
0
    def __init__(self, data):
        self.data = data
        need_authors = get_need_authors(self.data)
        need_papers = get_need_papers(self.data)
        paper_author_pacnt = ddict(lambda: ddict(int))
        author_paperset = dict()
        paper_author_name = ddict(lambda: ddict(set))
        for (paperid, authorid, name, affi) in self.data.paperauthor_tuples:
            if paperid in need_papers:
                paper_author_pacnt[paperid][authorid] += 1
                if len(name) > 0:
                    paper_author_name[paperid][authorid].add(name)
        
        
        for paperid,author_pacnt in paper_author_pacnt.items():
            for author, pacnt in author_pacnt.items():
                if author in need_authors or pacnt == 2:
                    author_paperset[author] = set()

        for (paperid, authorid, name, affi) in self.data.paperauthor_tuples:
            if authorid in author_paperset:
                author_paperset[authorid].add(paperid)

        self.author_paperset = author_paperset
        self.paper_author_pacnt = paper_author_pacnt
        self.paper_author_name = paper_author_name
        return
Example #3
0
def run(trainfile, doc_mappid_file, out, num_mapper):
	docid_mapperid = load_docid_mapperid(doc_mappid_file)
	mapperid_cnt = ddict(int)
	for mapperid in docid_mapperid.values():
		mapperid_cnt[mapperid] += 1

	mapperid_partid = dict()
	partid_cnt = ddict(int)
	for mappid in sorted(mapperid_cnt.keys(), key=lambda x:-mapperid_cnt[x]):
		min_partid = -1
		for partid in range(num_mapper):
			if min_partid == -1 or partid_cnt[partid] < partid_cnt[min_partid]:
				min_partid = partid
		mapperid_partid[mappid] = min_partid
		partid_cnt[min_partid] += mapperid_cnt[mappid]

	fps = dict()
	
	for partid in mapperid_partid.values():
		fps[partid] = open('%s/part-%d' % (out, partid),'w')
	
	num_line = 0
	for line in file(trainfile):
		mappid = docid_mapperid[num_line]
		print >> fps[mapperid_partid[mappid]], '%d %s' % (num_line, line[:-1])
		num_line += 1

	for fp in fps.values():
		fp.close()
Example #4
0
def run_main(valid_csv, valid_gt_csv):
    csv_reader = csv.reader(file(valid_csv))
    csv_reader.next()
    author_papercnt = ddict(lambda :ddict(int))
    for cols in csv_reader:
        authorid = int(cols[0])
        for paperid in map(int, cols[1].split()):
            author_papercnt[authorid][paperid] += 1

    csv_reader = csv.reader(file(valid_gt_csv))
    csv_reader.next()

    print 'Authorids,ConfirmedPapers,DeletedPapers'
    for cols in csv_reader:
        authorid = int(cols[0])
        confirmedpapers = list()
        for paperid in map(int, cols[1].split()):
            confirmedpapers.append(paperid)
            author_papercnt[authorid][paperid] -= 1

        deletedpapers = list()
        for paperid, cnt in author_papercnt[authorid].items():
            if cnt < 0:
                print >> sys.stderr, 'error'
            if cnt > 0:
                for i in range(cnt):
                    deletedpapers.append(paperid)

        print '%d,%s,%s' % (authorid, ' '.join(map(str,confirmedpapers)), ' '.join(map(str,deletedpapers)))
Example #5
0
def check_links():
  print "\nMissing Links"
  outs = []
  refs = ddict(lambda: ddict(set))

  for fn in glob('doc/*/*.html') + glob('doc/*/*/*.html'):
    # print fn
    soup = parse(fn)
    for link in soup.select('[href]'):
      href = link['href']
      url = absify(fn, href)
      if url.startswith('http'):
        outs.append(url)
      else:
        if '#' in url:
          pg, anchor = url.split('#')
          refs[pg]['anchors'].add('#'+anchor)
          url = pg
        refs[url]['refrs'].add(fn)
      # print "- %s :: <%s>" % (url, link['href'])
  refs = {k:dict((kind, sorted(lst)) for kind, lst in v.items()) for k,v in refs.items()}

  # broken links
  for fn,info in sorted(refs.items()):
    if not os.path.exists(fn):
      print "!",fn,'<-',info['refrs']
Example #6
0
def get_kegg_reactions():
    """
    :return:
    """
    import multiprocessing
    rp_record_by_id = ddict(lambda: ddict(set))

    reac_ids = kegg_parser.reactionIds
    print("# reacids: {0}".format(len(reac_ids)))

    p = multiprocessing.Pool(processes=multiprocessing.cpu_count())

    t = p.map(get_compounds, reac_ids, chunksize=20)

    for reactants_ids, product_ids in t:
        for id__ in reactants_ids:
            for id_ in product_ids:
                rp_record_by_id[id__]['as_r'].add(id_)
                rp_record_by_id[id_]['as_p'].add(id__)

    # transform value to list
    for key in rp_record_by_id:
        v_r = rp_record_by_id[key]['as_r']
        rp_record_by_id[key]['as_r'] = list(v_r)
        v_p = rp_record_by_id[key]['as_p']
        rp_record_by_id[key]['as_p'] = list(v_p)
    print("len rp record: {}".format(len(rp_record_by_id)))
    return rp_record_by_id
Example #7
0
 def _aggregate(self, kw):
     event_types = db.get_event_types()
     event_types_lookup = dict((id, name) for name, id in event_types.items())
     # Build the return value in this object
          #State        # CL region   # Daterange   #eventtype
     rv = ddict(lambda: ddict(lambda: ddict(lambda: ddict(
         lambda: dict((e,0) for e in kw['counts']))))) # And the actual counts...
     for event in db.get_all_events():
         if event['event_id_obfuscated'] in self.anomalous:
             continue
         if event_types_lookup[event['event_type_id']] in self.hidden_events:
             continue
         interval = bisect.bisect(kw['timebreaks'], event[kw['time_type']])
         if interval == 0 or interval == len(kw['timebreaks']):
             # Event lies outside requested time intervals
             continue
         # If we decide to make smaller queries from the front end,
         # a  lot of  this filtering  could be  moved into  the sql
         # query in db.py to make it faster.  But it's clearer here.
         if not ((event['venue_state_cd'] in kw['states'])    and
                 (event['clregion']       in kw['clregions']) and
                 (event['venue_zip']      in kw['zips'])      and
                 (event['event_type_id']  in kw['event_types'])):
             continue
         datestring = datetime.strftime(kw['timebreaks'][interval-1], '%Y-%m-%d')
         event_type = event_types_lookup[int(event['event_type_id'])]
         ccounts = rv[event['venue_state_cd']][event['clregion']][datestring][event_type]
         for counttype in kw['counts']:
             summand = {'count': 1, 'rsvp': int(event['attendee_count'])}[counttype]
             ccounts[counttype] += summand
     return deddict(rv)
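Given the nesting spelled out in the comment above (state, then CL region, then date range, then event type), one cell of the aggregate would be addressed like this; the keys below are hypothetical and purely illustrative:

# rv['WI']['region-3']['2020-03-01']['phone bank']['count'] += 1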
Example #8
0
  def __init__ (self):
    self._next_key = 0

    self._edges = ddict(lambda:ddict(lambda:ddict(lambda:{})))
    # node -> node -> key -> {attr}

    self._nodes = {}
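The trailing comment describes a multigraph-style layout (node -> node -> key -> attribute dict). A hypothetical edge insertion against that layout might look like the following; the instance g, the node names, and the attribute are invented for illustration:

key = g._next_key
g._next_key += 1
g._edges['a']['b'][key] = {'weight': 3}   # node -> node -> key -> {attr}
g._nodes['a'] = {}
g._nodes['b'] = {}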
Example #9
0
def check_links():
    print "\nMissing Links"
    outs = []
    refs = ddict(lambda: ddict(set))

    for fn in glob("doc/*/*.html") + glob("doc/*/*/*.html"):
        # print fn
        soup = parse(fn)
        for link in soup.select("[href]"):
            href = link["href"]
            url = absify(fn, href)
            if url.startswith("http"):
                outs.append(url)
            else:
                if "#" in url:
                    pg, anchor = url.split("#")
                    refs[pg]["anchors"].add("#" + anchor)
                    url = pg
                refs[url]["refrs"].add(fn)
            # print "- %s :: <%s>" % (url, link['href'])
    refs = {k: dict((kind, sorted(lst)) for kind, lst in v.items()) for k, v in refs.items()}

    # broken links
    for fn, info in sorted(refs.items()):
        if not os.path.exists(fn):
            print "!", fn, "<-", info["refrs"]
Example #10
0
 def _compute_mult(self, node):
     W = ddict(int)
     rW = ddict(list)
     for g in node.get_children():
         s = self.lcamap[g]
         rW[s.name].append(g)
         W[s.name] += 1
     return W, rW
Example #11
0
	def __init__(self, *args, **kwargs):
		super(MyController, self).__init__(*args, **kwargs) # Mandatory

		self.mac_tables = ddict(dict)  # {DPID => {MAC => PORT}}
		self.arp_table = dict() # {IP => MAC}
		self.switchports = ddict(dict) # {DPID => {PORTID => PORT_STATUS}}
		self.stats_requester_thread = hub.spawn(self._port_stats_requester)
		self.switches = {}  # {DPID => SWITCH}
Example #12
0
    def infer_assignment_probabilities(self, n_samples=200, n_burning_sample=10):
        """
        Main function
        :param n_samples:
        :param n_burning_sample:
        """
        assert n_samples > n_burning_sample, "n_burning_sample must be smaller than n_samples"
        # assign the features that have no ambiguity first
        self._assign_without_ambiguity()

        # get a counter for each feature in order to store sampling results

        # mapping between an annotation and its metabolite;
        # may be useful to retrieve probabilities and assign
        # it the posterior probability
        annotation_by_metab_id_by_feature = ddict(dict)

        # dict for counting the sampling result in each iteration
        # map a feature and collections.Counter
        counter_by_feature = ddict(Counter)

        # fill the 2 previous declared dicts
        for f in self.features:
            annotations = f.annotations
            #counter = Counter()
            for a in annotations:
                m = a.metabolite
                annotation_by_metab_id_by_feature[f][m.kegg_id] = a
                # we are counting the string
                #counter[m.kegg_id]  #= 0
                counter_by_feature[f][m.kegg_id] = 0  #= counter

        # initialization of prior probabilities
        logging.info("Init probabilities...")
        prob_by_metab_by_feature = self._init_probs()

        for i in xrange(n_samples):
            assigned_compounds = self.assigned_compounds
            rdm.shuffle(self.features)
            #for f in self.features:  #rdm.shuffle(self.features):
            self._sample(assigned_compounds, prob_by_metab_by_feature, counter_by_feature)
            self._update_probs(assigned_compounds, prob_by_metab_by_feature)
            sys.stdout.write("Progression:[*** " + str(int(round(float(i) / n_samples * 100))) + "% ***]" + chr(13))

        logging.info("Computing posterior probabilities...")
        # posterior probabilities calculation
        self._calc_posterior_probs(counter_by_feature,
                                   prob_by_metab_by_feature,
                                   n_samples, n_burning_sample)


        for f, prob_by_metab in prob_by_metab_by_feature.iteritems():
            for metab_id, prob in prob_by_metab.iteritems():
                if math.isnan(prob):
                    raise ValueError("nan probability")
                annotation_by_metab_id_by_feature[f][metab_id].score_network = prob

        logging.info("Done.")
Example #13
0
 def read_conference_jorunal_csv(self, conference_csv):
     csv_reader = csv.reader(file(conference_csv, 'r'))
     csv_reader.next()
     cid_info_dict = ddict(lambda:ddict())
     for cols in csv_reader:
         cid = int(cols[0])
         cid_info_dict[cid]['shortname'] = cols[1]
         cid_info_dict[cid]['longname'] = cols[2]
     return cid_info_dict
Example #14
0
def viterbi(sentence, tags, transition, emission):
    probs = ddict(lambda: ddict(lambda: -1))
    probs[-1]['<begin>'] = 1.0
    backpt = ddict(lambda: {})
    i = -1
    for i, word in enumerate(sentence):
        for tag, cur_count in tags.items():
            path_probs = {}
            if i == 0:
                prev_tag = '<begin>'
                tr = prev_tag+'_'+tag
                if tr in transition:
                    trans = float(transition[tr])
                else:
                    trans = -1
                em = tag+'_'+word
                if em in emission:
                    emiss = float(emission[em])/cur_count
                else:
                    emiss = -1
                path_probs[tag] = (probs[i-1][prev_tag] * trans * emiss)
            else:
                for prev_tag, prev_count in tags.items():
                    tr = prev_tag+'_'+tag
                    if tr in transition:
                        trans = float(transition[tr])/prev_count
                    else:
                        trans = -1
                    em = tag+'_'+word
                    if em in emission:
                        emiss = float(emission[em])/cur_count
                    else:
                        emiss = -1
                    path_probs[prev_tag] = (probs[i-1][prev_tag] * trans * emiss)
            best_tag = max(path_probs, key=path_probs.get)
            probs[i][tag] = path_probs[best_tag]
            backpt[i][tag] = best_tag
    i += 1
    for tag, cur_count in tags.items():
        tr = tag + '_' + '<finish>'
        if tr in transition:
            trans = float(transition[tr])/cur_count
        else:
            trans = -1
        emiss = 1
        path_probs[tag] = (probs[i-1][tag] * trans * emiss)
        best_tag = max(path_probs, key=path_probs.get)
        probs[i][tag] = path_probs[best_tag]
        backpt[i][tag] = best_tag

    current_tag = max(probs[i], key=probs[i].get)
    predicted_tags = []
    for j in range(i,-1,-1):
        predicted_tags.append(current_tag)
        current_tag = backpt[j][current_tag]
    predicted_tags.reverse()
    return predicted_tags[:-1]
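For reference, a toy sketch of the input shapes viterbi() expects; the counts below are invented for illustration and are not taken from the original project:

toy_tags = {'DET': 1, 'NN': 1}                                      # tag -> count
toy_transition = {'<begin>_DET': 1, 'DET_NN': 1, 'NN_<finish>': 1}  # 'prev_next' -> count
toy_emission = {'DET_the': 1, 'NN_dog': 1}                          # 'tag_word' -> count
# viterbi(['the', 'dog'], toy_tags, toy_transition, toy_emission)
# should return ['DET', 'NN'] with these counts.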
Example #15
0
def f90deps(srcs,directory=None):

    files = []
    exts = [ ".f90", ".F90", ".F", ".f", ".f77", ".f03", ".F03", ".f08", ".F08" ]
    for src in srcs:
        fileName, fileExtension = os.path.splitext(src)
        if fileExtension in exts:
            files.append(src)
    srcs = files

    use_line_re  = re.compile(r"^\s*use\s+(\S.+)\s*$",re.IGNORECASE)
    mod_line_re  = re.compile(r"^\s*module\s+(\S+)\s*$",re.IGNORECASE)
    cont_line_re = re.compile(r"^(.*)&\s*$")
    split_re = re.compile(r"\s*,\s*")
    dep_re   = re.compile(r"(.*)")
    mod_re   = re.compile(r"(.*)")

    info = ddict()
    for src in srcs:
        info[src] = f90depinfo()
        if directory is not None and directory != "":
            fh = file(directory + "/" + src,"r")
        else:
            fh = file(src,"r")

        liter = iter(fh)
        while True:
            try:
                line = getline(liter)
                
                has_use = re.match( use_line_re, line )
                has_mod = re.match( mod_line_re, line )
                if has_use is not None:
                    for mod in has_use.group(1).split(","):
                        info[src].uses[ mod.strip() ] = None
                elif has_mod is not None:
                    info[src].provides[ has_mod.group(1).strip() ] = None
            except Exception as e:
#                print "exception: ",e
                break
    modules = ddict()
    for src in srcs:
        for m in info[src].provides:
            modules[m] = src
    for src in srcs:
        tmp = copy.deepcopy( info[src].uses )
        for m in info[src].uses:
            if not m in modules:
                tmp.pop(m,None)
            else:
                tmp[m] = modules[m]
        for m in info[src].provides:
            if m in tmp:
                tmp.pop(m,None)
        info[src].uses = tmp
    return info
Example #16
0
 def __init__(self, data):
     self.data = data
     pid_uid_name = ddict(lambda:ddict(lambda:""))
     pid_uid_affi = ddict(lambda:ddict(lambda:""))
     for (pid, uid, name, affi) in data.paperauthor_tuples:
         pid_uid_name[pid][uid] = name
         pid_uid_affi[pid][uid] = affi
     self.pid_uid_name = pid_uid_name
     self.pid_uid_affi = pid_uid_affi
     return
Example #17
0
def run_main(submission_dir, submission_list_file, output):
    submissions = [ submission_dir + '/' + x.strip() for x in file(submission_list_file) if len(x.strip()) > 0 and x.strip()[0] != '#' ]
    author_paperrank = ddict(lambda: ddict(float))

    for submission in submissions:
        read_submission(submission, author_paperrank)
    
    with open(output, 'w') as fw:
        print >> fw, 'AuthorId,PaperIds'
        for authorid, paper_score in author_paperrank.items():
            print >> fw, '%d,%s' % (authorid, ' '.join(map(lambda x:str(x[0]), sorted(paper_score.items(), key=lambda x:-x[1]))))
Example #18
0
 def train(self, tagged_sentences):
     # count number of times a word is given each tag
     word_tag_counts = ddict(lambda: ddict(lambda: 0))
     for sent in tagged_sentences:
         for (word, tag) in sent:
             word_tag_counts[word][tag] += 1
     # select the tag used most often for the word
     for word in word_tag_counts:
         tag_counts = word_tag_counts[word]
         tag = max(tag_counts, key=tag_counts.get)
         self._word_tags[word] = tag
Example #19
0
File: protein.py Project: bald/p3d
 def __init__(self,pdbfile,mode='3D',chains=None,MaxAtomsPerLeaf=96,DunbrackNaming=False,BSPTree=True):
     if not os.path.exists(pdbfile): 
         raise InputError('File does not exist!')
         sys.exit(1)
     else:
         startTime = time.time()
         self.filename = str(pdbfile)
         self.hash = {}
         self.stats = ddict(int)
         self.geoCentre = p3d.vector.Vector()
         self.massCentre = p3d.vector.Vector()
         self.atoms = [] # storage for all Atom objects
         self.chainTermini = ddict(list)
         self.init_hashes()
         self.resolution = 0.01
         self.leftOvers = []
         self.atomInfoIndex = None
         self.header = []
         self.headers_infos = ddict(list)
         self.conect = []
         # --*-- remove path info from pdb filename --*--
         self.fullname = str(pdbfile) if len(pdbfile.split('/')) == 0 else str(pdbfile.split('/')[-1])
         # --*-- head of family flag used for high throughput to distinguish redundant and non-redundant sets --*--
         self.HOF = False
         if len(self.fullname.split('.')[0]) <= 4:
             self.id = self.fullname[:4]
             self.dunbrackChain = None
         elif len(self.fullname.split('.')[0]) == 5 and DunbrackNaming == True:
             # special identifiers in file name, i.e. Dunbrack chain ids
             self.id = self.fullname[:4]
             self.dunbrackChain = self.fullname[4:5].upper()
             if chains == None: chains = list(self.dunbrackChain)
         elif len(self.fullname.split('.')[0]) == 6 and DunbrackNaming == True:
             # special identifiers in filename, i.e. Dunbrack chain id and head of family hook
             if self.fullname[:1] == '_':
                 self.HOF = True
                 self.id = self.fullname[1:5]
                 self.dunbrackChain = self.fullname[5:6]
         else:
             self.id = self.fullname.split('.')[0]
             self.dunbrackChain = None
         if mode == '3D':
             self.BSPTree = p3d.tree.Tree(protein=self)
             self.read_in_3Dstructure(pdbfile,chains=chains)
             if BSPTree:
                 self.BSPTree.build(MaxAtomsPerLeaf=MaxAtomsPerLeaf)
         elif mode == '2D':
             self.read_in_2Dstructure(pdbfile,chains=chains)
         else:
             print("mode error")
             sys.exit(1)
         self.init_parser()
         self.time = round(time.time()-startTime,3)
     return
Example #20
0
	def start(self):
		self.is_started=True
		self.start_time = time.time()
		self.round_duration = len(color_set) * self.interval_duration
		self.pools = ddict(_get_color_pool)
		def color_builder():
			(r, i) = calculate_round_interval(self.start_time, self.interval_duration)
			return self.pools[r].pop()
		self.colors = ddict(lambda: ddict(color_builder))
		# graphics stuff
		self.prog_bar = utils.ProgressBar(self.size[0], self.size[1] / 10.0, 1.0, bg_color=background_color, border_width=5, border_color=(255,69,0))
Example #21
0
    def get_intermediate_data(self):
        need_papers_set = get_need_papers(self.data)
        authorpaper_count = ddict(int)
        paper_authorlist = ddict(set)
        for (paperid, authorid, name, affi) in self.data.paperauthor_tuples:
            if paperid in need_papers_set:
                authorpaper_count[(authorid, paperid)] += 1
                paper_authorlist[paperid].add(authorid)

        self.authorpaper_count = authorpaper_count
        self.paper_authorlist = paper_authorlist 
        return 
Example #22
0
 def process_gen_bonus(self, input_file):
     bonus_file = "%s/bonus.txt" % self.result_dir
     months = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
     bonuses = ddict(lambda: ddict(int))
     data = load_data(input_file)
     most_recent_day = max([sum(months[0:int(item[3])])+int(item[4]) for item in data])
     for record in data:
         if most_recent_day - (sum(months[0:int(record[3])])+int(record[4])) < self.bonus:
             user, item = record[0], record[1]
             bonuses[user][item] += 1
     output_mbonus(output_file=bonus_file, bonus=bonuses)
     return bonus_file
Example #23
0
 def __init__(self, data):
     self.data = data
     need_authors = get_need_authors(self.data)
     need_papers = get_need_papers(self.data)
     paper_author_name = ddict(lambda: ddict(set))
     for (paperid, authorid, name, affi) in self.data.paperauthor_tuples:
         if paperid in need_papers:
             if len(name) > 0:
                 paper_author_name[paperid][authorid].add(name)
     
     self.paper_author_name = paper_author_name
     return
Example #24
0
    def __init__(self, data):
        self.data = data
        need_papers = get_need_papers(self.data)
        paper_authorlist = ddict(list)
        pa_cnt = ddict(int)
        for (paperid, authorid, name, affi) in data.paperauthor_tuples:
            if paperid in need_papers:
                paper_authorlist[paperid].append((authorid, name))
                pa_cnt[(paperid, authorid)] += 1

        self.paper_authorlist = paper_authorlist
        self.pa_cnt = pa_cnt
Example #25
0
def lcseq_len(a, b):
    m = len(a)
    n = len(b)
    c = ddict(lambda:ddict(lambda:0))
    for i in range(1, m+1):
        for j in range(1, n+1):
            if a[i-1] == b[j-1]:
                c[i][j] = c[i-1][j-1] + 1
            elif c[i-1][j] >= c[i][j-1]:
                c[i][j] = c[i-1][j]
            else:
                c[i][j] = c[i][j-1]
    return c[m][n]
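A quick sanity check of lcseq_len on a classic textbook pair (illustrative):

# lcseq_len("ABCBDAB", "BDCABA") == 4   # e.g. "BCBA" is one longest common subsequence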
Example #26
0
    def __init__(self, data):
        self.data = data
        train_authorpaperlist = ddict(list)
        need_authors = set()
        for (authorid, paperid, label) in data.train_tuples:
            train_authorpaperlist[authorid].append(paperid)
            need_authors.add(authorid)
        pa_authorpaperlist = ddict(list)
        for (paperid, authorid, name, affi) in data.paperauthor_tuples:
            if authorid in need_authors:
                pa_authorpaperlist[authorid].append(paperid)

        self.train_authorpaperlist = train_authorpaperlist
        self.pa_authorpaperlist = pa_authorpaperlist
Example #27
0
    def get_intermediate_data(self):

        author_deleted_paperset = ddict(set)
        author_confirmed_paperset = ddict(set)
        for (authorid, paperid, label) in self.data.train_tuples:
            if label == 1:
                author_confirmed_paperset[authorid].add(paperid)
            else:
                author_deleted_paperset[authorid].add(paperid)

        need_paper_set = get_need_papers(self.data)
        self.paper_author_list = get_paper_authorlist(self.data, need_paper_set)
        self.author_deleted_paperset = author_deleted_paperset
        self.author_confirmed_paperset = author_confirmed_paperset
        return 
Example #28
0
    def __init__(self, data):
        self.data = data
        need_papers = get_need_papers(self.data)
        need_authors = get_need_authors(self.data)
        paper_authorlist = ddict(lambda: ddict(list))
        author_paperlist = ddict(list)
        for (paper, author, name, affi) in data.paperauthor_tuples:
            if paper in need_papers:
                paper_authorlist[paper][author].append((name, affi))
            if author in need_authors:
                author_paperlist[author].append((paper, name, affi))

        self.paper_authorlist = paper_authorlist     
        self.author_paperlist = author_paperlist
        return
Example #29
0
def clusterize_hierarchical(peakels, matrix_dist, cut, clip=False):
    """

    :param clip:
    :param peakels:
    :param matrix_dist:
    :param cut:
    """
    # negative values in the distance matrix would lead to a ValueError,
    # so clip in order to prevent negative values in the distance matrix
    if clip:
        np.clip(matrix_dist, 0, 1, matrix_dist)
    k = linkage(matrix_dist, method='complete')

    #dist = maxdists(k)
    #fit = norm.fit(dist)
    #cut = np.percentile(dist, 10.0)  #norm.ppf(5.0, loc=fit[0], scale=fit[1])

    k2 = fcluster(k, cut, criterion='distance')  #, criterion='distance')
    clust_by_id = ddict(list)
    for i, v in enumerate(k2):
        clust_by_id[v].append(peakels[i])
    return clust_by_id.values()
Example #30
0
def all_passing_stats(pDict, func, weight = 'weight'):
    stats = ddict(dict)
    for team1 in pDict.keys():
        for team2 in pDict[team1].keys():
            G = df_to_graph(pDict[team1][team2])
            stats[team1][team2] = func(G, weight = weight)
    return stats
Example #31
0
def project_list_submissions(user, event):
    # get roster indexed by wiscmail
    roster = {
        student['net_id'] + "@wisc.edu": student
        for student in get_roster_json() if 'net_id' in student
    }
    project_id = event['project_id']

    if not project_id in get_project_due_utc():
        return (500, 'invalid project id')
    paths = s3().s3_all_keys('projects/' + project_id + '/')

    # email => list of submission IDs
    submissions = ddict(list)
    direct_submissions = ddict(list)

    # emails of students who have received test results or CRs on some version of their code
    reviewed = set()
    tested = set()

    for path in paths:
        parts = path.split('/')

        # Example:
        # projects/p2/tylerharter*at*gmail.com/2019-08-16_17-58-37/submission.json (len=5)
        # projects/p2/tylerharter*at*gmail.com/2019-08-16_17-58-37-link.json (len=4)

        if len(parts) > 2:
            email = parts[2].replace("*at*", "@")

        if len(parts) == 5 and parts[-1] == 'submission.json':
            direct_submissions[email].append(parts[3])

        if len(parts) == 5 and parts[-1] == 'cr.json':
            reviewed.add(email)

        if len(parts) == 5 and parts[-1] == 'test.json':
            tested.add(email)

        link_suffix = '-link.json'
        if len(parts) == 4 and parts[-1].endswith(link_suffix):
            submissions[email].append(parts[3][:-len(link_suffix)])

    # we only want links to submissions that have been directly
    # submitted (in contrast to submissions submitted by a partner),
    # so we don't review the same thing twice.
    #
    # we're also only interested in the most recent submission
    rows = []
    for email in direct_submissions:
        latest_direct = max(direct_submissions[email])

        if len(submissions[email]) == 0:
            # this should only happen if lambda crashed between S3 writes, or a submission is withdrawn
            continue

        latest = max(submissions[email])
        if latest_direct != latest:
            continue  # this direct submission has been superseded by a partner's more-recent submission

        row = {
            'project_id': project_id,
            'student_email': email,
            'submission_id': latest_direct,
            'has_review': email in reviewed,
            'tested': email in tested,
            'info': {},
        }

        # supplement with info from roster
        for field in ['net_id', 'ta']:
            row['info'][field] = roster.get(email, {}).get(field, None)

        rows.append(row)

    return (200, {
        'submissions': rows,
        'reviewed': list(reviewed),
        'direct': list(direct_submissions.keys())
    })
Example #32
0
puquxi hmeaehh oxe tasipw qzyg hyvy wcmpwe
hvs fxq wvfy zjepsl dvrfxnc xnvg
xle crcuc qkhnv crcuc oedez bjw pmwq
xzzpiy cjwss jwscs apb bpa
ydjhhf yeltadb lwi cjdcb ovaox xrdm vkxub
zax xza admbc lvpzfeh auxn rwasj
kebx eild nrskdr meja jxczomh gcne"""

inData = """
abcde fghij
abcde xyz ecdab
a ab abc abd abf abj
iiii oiii ooii oooi oooo
oiii ioii iioi iiio
"""

myDict = ddict(lambda: 0)

validNo = 0
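# note: the leading/trailing newlines in inData produce two empty entries that also
# pass the duplicate-word check, so this sample prints 7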

for pwd in inData.split('\n'):
    myDict.clear()
    validPwd = True
    for part in pwd.split(' '):
        myDict[part] += 1
        if myDict[part] > 1:
            validPwd = False
    if validPwd:
        validNo += 1
print(validNo)
Example #33
0
    def postflight(self):
        '''
        read the output and merge in back to the ident csv
        '''

        potential_buggy_percolator_output = self.params['translations'][
            'percolator_out'] + '.psms'
        if os.path.exists(potential_buggy_percolator_output):
            print('WTF Percolator ?')
            print('Renaming: \n{percolator_out}.psms ->> {percolator_out}'.
                  format(**self.params['translations']))
            os.rename(
                self.params['translations']['output_file_incl_path'] +
                '.psms.psms',
                self.params['translations']['output_file_incl_path'] + '.psms')
            os.rename(
                self.params['translations']['decoy_output_file_incl_path'] +
                '.psms.psms',
                self.params['translations']['decoy_output_file_incl_path'] +
                '.psms')
            os.rename(
                self.params['translations']['output_file_incl_path'] +
                '.psms.peptides',
                self.params['translations']['output_file_incl_path'] +
                '.peptides')

        s2l = {'target': ddict(list), 'decoy': ddict(list)}
        for pkey, p_out in [('target', 'percolator_out'),
                            ('decoy', 'percolator_decoy_out')]:

            percolator_output_dict_reader = csv.DictReader(open(
                self.params['translations'][p_out], 'r'),
                                                           delimiter='\t')
            for line_dict in percolator_output_dict_reader:

                peptide = line_dict['peptide'].split('.')[1]
                psmid_pep_key = (
                    line_dict['PSMId'],
                    peptide,
                )
                if psmid_pep_key not in s2l[pkey].keys():
                    s2l[pkey][psmid_pep_key] = line_dict

        opened_file = open(self.params['translations']['csv_input_file'], 'r')
        csv_input = csv.DictReader(row for row in opened_file
                                   if not row.startswith('#'))

        if "PEP" not in csv_input.fieldnames and "q-value" not in csv_input.fieldnames:
            csv_input.fieldnames += ['PEP', 'q-value']
        csv_kwargs = {}

        if sys.platform == 'win32':
            csv_kwargs['lineterminator'] = '\n'
        else:
            csv_kwargs['lineterminator'] = '\r\n'

        csv_output = csv.DictWriter(
            open(self.params['translations']['output_file_incl_path'], 'w'),
            csv_input.fieldnames, **csv_kwargs)

        csv_output.writeheader()
        for line_dict in csv_input:

            # check if the current line is a decoy or a target
            # so we know in which percolator output file we have to look for it.
            psm_type = "target"
            if line_dict['Is decoy'].upper() == 'TRUE':
                psm_type = "decoy"
            # if self.params['translations']['decoy_tag'] in line_dict['proteinacc_start_stop_pre_post_;']:
            #     line_dict['Is decoy'] = "true"
            #     psm_type = "decoy"
            if line_dict['Modifications'].strip() != '':
                seq_and_mods = '#'.join(
                    [line_dict['Sequence'], line_dict['Modifications']])
            else:
                seq_and_mods = line_dict['Sequence']
            _psmid_pep_key = (
                line_dict['Spectrum Title'],
                seq_and_mods,
            )
            if _psmid_pep_key in s2l[psm_type].keys():
                line_dict['PEP'] = s2l[psm_type][_psmid_pep_key][
                    'posterior_error_prob']
                line_dict['q-value'] = s2l[psm_type][_psmid_pep_key]['q-value']
                # write all results including decoy into the full csv:
                csv_output.writerow(line_dict)
            else:
                print(
                    'Original PSM: {0} could not be found in the percolator output file, most probably because the PSM was filtered by percolator (multiple peptides matching one spectrum)'
                    .format(_psmid_pep_key))
Example #34
0
def main(input_file=None, decoy_tag=None, output_file=None):
    '''
    Converts xTandem.xml files into .csv
    We need to do this on our own, because mzidentml_lib 
    reports wrong positions for modifications
    (and it is also not able to convert the piledriver.mzid into csv)

    It should be noted that
    - xtandem groups are not merged (since it is not the same as protein groups)
    - multiple domains (multiple occurrences of a peptide in the same protein) are not reported
    
    '''
    NEW_HEADERS = [
        'Raw data location',
        'Spectrum ID',
        'Spectrum Title',
        'Retention Time (s)',
        'rank',
        'Calc m/z',
        'Exp m/z',
        'Charge',
        'Sequence',
        'Modifications',
        'X\!Tandem:expect',
        'X\!Tandem:hyperscore',
        'proteinacc_start_stop_pre_post_;',
        'Is decoy',
    ]
    PROTON = 1.00727646677
    protein = None
    group = None
    group_counter = 0
    protein_groups = []

    csvOut = csv.DictWriter(open(output_file, 'w', newline=''), NEW_HEADERS)
    csvOut.writeheader()

    print("Converting XTandem XML into CSV: {0}".format(input_file))
    tandemXML = iter(
        cElementTree.iterparse(input_file, events=(b'start', b'end')))

    for pos, (event, element) in enumerate(tandemXML):
        if event == b'start':
            if element.tag.endswith('bioml'):
                raw_data_location = element.attrib['label'].split("'")[1]
            if element.tag.endswith('group'):
                if 'mh' in element.attrib.keys():
                    group_counter = 0
                    notes = ddict(set)
                    notes_key = None
                    group = {}  # reset group
                    group['Raw data location'] = raw_data_location
                    group['Charge'] = element.attrib['z']
                    group['Exp m/z'] = (float(element.attrib['mh']) /
                                        float(group['Charge'])) + PROTON
                    group['X\!Tandem:expect'] = element.attrib['expect']
                    protein_groups = []
                    domain_groups = []
                if element.attrib['label'] == 'fragment ion mass spectrum':
                    notes_key = 'spectrum'
                if element.attrib['label'] == 'input parameters':
                    notes_key = 'parameters'
                group_counter += 1

            elif element.tag.endswith('protein'):
                protein = {}  # reset protein
                notes_key = 'protein'

            elif element.tag.endswith('domain'):
                if 'seq' in element.attrib.keys():
                    domain = {header: ''
                              for header in NEW_HEADERS}  # reset domain
                    domain['Sequence'] = element.attrib['seq']
                    domain['X\!Tandem:hyperscore'] = element.attrib[
                        'hyperscore']
                    domain['Calc m/z'] = (float(element.attrib['mh']) /
                                          float(group['Charge'])) + PROTON
                    protein['Start'] = element.attrib['start']
                    protein['Stop'] = element.attrib['end']
                    protein['pre'] = element.attrib['pre'][-1]
                    protein['post'] = element.attrib['post'][0]
                    if protein['Start'] == '1':
                        protein['pre'] = '-'
                    if protein['post'] == ']':
                        protein['post'] = '-'
                    domain[
                        'proteinacc_start_stop_pre_post_;'] = '{0}_{1}_{2}_{3}_{4};'.format(
                            protein['Defline'],
                            protein['Start'],
                            protein['Stop'],
                            protein['pre'],
                            protein['post'],
                        )
                    if protein['Defline'].startswith(decoy_tag):
                        domain['Is decoy'] = True
                    else:
                        domain['Is decoy'] = False
                    domain['rank'] = 1

            elif element.tag.endswith('aa'):
                if 'modified' not in element.attrib.keys():
                    continue
                mod = '{0}:{1}'.format(
                    element.attrib['modified'],
                    int(element.attrib['at']) - int(protein['Start']) + 1)
                try:
                    domain['Modifications'].append(mod)
                except:
                    domain['Modifications'] = [mod]
        else:
            if element.tag.endswith('protein'):
                protein_groups.append(protein)
            if element.tag.endswith('domain'):
                for k, v in domain.items():
                    if k not in protein.keys():
                        protein[k] = v
                # domain_groups.append( domain )
            elif element.tag.endswith('note'):
                if notes_key == 'spectrum':
                    group['Spectrum Title'] = element.text.strip()
                elif notes_key == 'protein':
                    protein['Defline'] = element.text.strip()
            elif element.tag.endswith('group'):
                group_counter -= 1
                if group_counter == 0:
                    dict2write = {}
                    # proteins = []
                    for protein_group in protein_groups:
                        for key in NEW_HEADERS:
                            # if key == 'proteinacc_start_stop_pre_post_;':
                            #     proteins.append(protein_group[key])
                            #     continue
                            if key == 'Modifications' and protein_group[
                                    key] != '':
                                dict2write[key] = ';'.join(protein_group[key])
                                continue
                            try:
                                dict2write[key] = group[key]
                            except:
                                dict2write[key] = protein_group[key]
                        # dict2write['proteinacc_start_stop_pre_post_;'] = ''.join(proteins)
                        csvOut.writerow(dict2write)
                    protein_groups = []
    return
Example #35
0
    else:
        next_state = current_state + direction_deltas[action_indx]
    # Boundaries
    if next_state[0] < 0:
        next_state[0] = 0
    if next_state[0] > 39:
        next_state[0] = 39
    if next_state[1] < 0:
        next_state[1] = 0
    if next_state[1] > 39:
        next_state[1] = 39
    return next_state


# Exploration
sar_dict = ddict(list)
set_of_episode_traces = []
for ep_n in range(exploration_episode_number):
    ###
    current_state = [0, 39, 1]
    iter_number = 1
    episode_trace = ['start']
    start_time = time.time()
    while reward(episode_trace, layout_dict) < 9 and iter_number < max_it_number:
        iter_number += 1
        action = random.randint(0, 3)
        next_state_2d = list(take_action(current_state[0:-1], action))
        next_state_label = layout[next_state_2d[0]][next_state_2d[1]]
        if next_state_label != 0:  # ignoring the neutral label
            episode_trace.append(next_state_label)
            # ### SYNTH ### #
Example #36
0
    def __init__(
        self,
        path=None,
        noiseThreshold=0.0,
        extraAccessions=None,
        MS1_Precision=5e-6,
        MSn_Precision=20e-6,
        build_index_from_scratch=False,
        file_object=None,
        obo_version=None,
    ):
        # self.param contains user-specified parsing parameters
        self.param = dict()

        self.param['noiseThreshold'] = noiseThreshold
        self.param['MS1_Precision'] = MS1_Precision
        self.param['MSn_Precision'] = MSn_Precision
        self.param['accessions'] = {}

        # self.info contains information extracted from the mzML file
        self.info = dict()

        self.info['offsets'] = ddict()
        self.info['offsetList'] = []
        self.info['referenceableParamGroupList'] = False

        self.info['spectrum_count'] = 0
        self.info['chromatogram_count'] = 0

        self.info['obo_version'] = obo_version

        self.info['encoding'] = None

        self.MS1_Precision = MS1_Precision

        self.elementList = []

        # Default stuff

        # Can actually be either a spectrum _or_ a chromatogram; the Spectrum
        # class supports both
        self.spectrum = pymzml.spec.Spectrum(
            measuredPrecision=MS1_Precision,
            param=self.param,
        )
        self.spectrum.clear()

        assert path is not None or file_object is not None, \
            'Must provide either a path or a file object to parse'

        self.info['fileObject'], self.info['seekable'] = self.__open_file(
            path, file_object)
        self.info['filename'] = path

        if self.info['seekable']:
            # Seekable files can use the index for random access
            self.seeker = self._build_index(build_index_from_scratch)

        self.iter = self.__init_iter()
        self.OT = self.__init_obo_translator(extraAccessions)
        return
Example #37
0
    def _parse_sequence_unimod_style(self, sequence):
        minPos = sequence.index("#")
        peptide = sequence[:minPos]
        addon = sequence[minPos + 1:]
        self.peptide = peptide
        if peptide != '':
            self.add_peptide(peptide)
            self['O'] += 1
            self['H'] += 2
        self.addon = addon
        unimods = self.addon.split(';')
        # pattern = self.regex_patterns[':pos']
        pattern = re.compile(r''':(?P<pos>[0-9]*$)''')
        for unimod in unimods:
            if unimod == '':
                continue
            unimod = unimod.strip()
            if ':' not in unimod:
                sys.exit(
                    'This unimod: {0} requires positional information'.format(
                        unimod))

            for occ, match in enumerate(pattern.finditer(unimod)):
                try:
                    unimodcomposition = self._unimod_parser.name2composition(
                        unimod[:match.start()])
                except:
                    sys.exit(
                        'Cannot map unimod {0}. extracted position argument {1}'
                        .format(unimod, match.start()))
                # if occ >= 1:
                position = int(match.group('pos'))
                if position in self.unimod_at_pos.keys():
                    print('{0} <<- Two unimods at the same position ? '.format(
                        sequence))
                    raise Exception

                self.unimod_at_pos[position] = unimod[:match.start()]
            # match = re.search( position_re_pattern, unimod)
            # if match is not None:
            #     end = match.start()
            #     print( '>>>>', match)
            # else:
            #     end = len( unimod )
            # try:
            #     unimodcomposition = self._unimod_parser.name2composition(
            #         unimod[:end ]
            #     )
            # except:
            #     print(
            #         'Unimod error:', unimod,'>>', unimod[:end],
            #         re.search( position_re_pattern , unimod),
            #         re.search( position_re_pattern , unimod).start()
            #     )
            #     exit(1)
            # print( self , 'peptide only')
            # print( 'Unimod:', unimod, unimod[:end] , )
            # Full addition
            # print( unimodcomposition , '<<<<<<')
            for k, v in unimodcomposition.items():
                self[k] += v
            # storage position related modifications
            position = int(match.group('pos'))
            if position == 0:
                # E.g. Acetylation at pos 0 indicates N-Term
                # but has to be counted for position 1 in this class
                position = 1

            if position not in self.composition_of_mod_at_pos.keys():
                self.composition_of_mod_at_pos[position] = ddict(int)
            if position not in self.composition_at_pos.keys():
                self.composition_at_pos[position] = ddict(int)
            for k, v in unimodcomposition.items():
                self.composition_of_mod_at_pos[position][k] += v
                self.composition_at_pos[position][k] += v

        return
Example #38
0
 def __init__(self, responses, vectors):
     self.type_vectors = ddict(int)  # FastText vector file
     self.res_nvec = {}  # Responses-Normalised vectors
     self.load_vectors(vectors)
     self.normalize_responses(responses)
Example #39
0
def main():
    '''
    Example script to do a fragment mass tolerance parameter sweep.

    usage:
        ./bsa_fragment_mass_tolerance_example.py

    If the fragment mass tolerance becomes too small, very few peptides are found.
    With this small sweep, the actual minimum accuracy of a mass spectrometer can
    be estimated.

    '''
    fragment_mass_tolerance_list = [
        0.02,
        0.04,
        0.06,
        0.08,
        0.1,
        0.2,
        0.3,
        0.4,
        0.5,
    ]

    engine_list = ['xtandem_vengeance']

    R = ursgal.UController(
        profile='LTQ XL low res',
        params={
            'database':
            os.path.join(os.pardir, 'example_data', 'BSA.fasta'),
            'modifications': [
                'M,opt,any,Oxidation',  # Met oxidation
                'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
                '*,opt,Prot-N-term,Acetyl'  # N-Acetylation
            ],
        })

    mzML_file = os.path.join(os.pardir, 'example_data',
                             'BSA_fragment_mass_tolerance_example',
                             'BSA1.mzML')
    if os.path.exists(mzML_file) is False:
        R.params[
            'http_url'] = 'http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw'
        R.params['http_output_folder'] = os.path.dirname(mzML_file)
        R.fetch_file(engine='get_http_files_1_0_0')
        try:
            shutil.move('{0}?format=raw'.format(mzML_file), mzML_file)
        except:
            shutil.move('{0}format=raw'.format(mzML_file), mzML_file)

    # Convert mzML to MGF outside the loop, so this step is not repeated in the loop
    mgf_file = R.convert_to_mgf_and_update_rt_lookup(input_file=mzML_file)

    for engine in engine_list:
        for fragment_mass_tolerance in fragment_mass_tolerance_list:

            R.params['frag_mass_tolerance'] = fragment_mass_tolerance

            R.params['prefix'] = '{0}_fragment_mass_tolerance_'.format(
                fragment_mass_tolerance)

            unified_search_result_file = R.search(
                input_file=mgf_file,
                engine=engine,
                force=False,
            )

    collector = ddict(set)
    for csv_path in glob.glob('{0}/*/*unified.csv'.format(
            os.path.dirname(mzML_file))):
        for line_dict in csv.DictReader(open(csv_path, 'r')):
            collector[csv_path].add(line_dict['Sequence'])
    for csv_path, peptide_set in sorted(collector.items()):
        file_name = os.path.basename(csv_path)
        tolerance = file_name.split('_')[0]
        print(
            'Search with {0: <4} Da fragment mass tolerance found {1: >2} peptides'
            .format(tolerance, len(peptide_set)))
    return
Example #40
0
#==============================================#
#         Created By: Svess#8004               #
#  Last Modification:  2021-03-03 10:33 UTC+0  #
#==============================================#

# I used this to get the list of all different kakera reaction emotes so I could go through them and see if they were used by the bot at some point

from collections import defaultdict as ddict
import json

files = ["flood-7.json"]

Mudae = "432610292342587392"
Mudamaid2 = "488711695640821760"

emotes = ddict(str)

dct = ddict(int)
for File in files:
    searchfile = open(File, )
    data = json.load(searchfile)

    for message in data['messages']:
        if message["author"]["id"] == Mudamaid2:
            if message["reactions"] != []:
                for i in message["reactions"]:
                    dct[i["emoji"]["id"]] += 1
                    emotes[i["emoji"]["id"]] = i["emoji"]["name"]

    searchfile.close()
Example #41
0
from sys import stdin
from collections import defaultdict as ddict
dic = True
mapp = ddict(lambda: 'eh')
for i in stdin:
    if dic:
        j = i.split()
        if len(j) == 0:
            dic = False
            continue
        mapp[j[1]] = j[0]
    else:
        print(mapp[i.strip()])
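For context, the script reads "english foreign" dictionary lines until a blank line and then translates the remaining query lines, printing 'eh' for unknown words. An illustrative run with made-up input (not from the original source):

# given stdin:
#   dog ogday
#   cat atcay
#   (blank line)
#   atcay
#   zzz
# the script prints:
#   cat
#   eh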
Example #42
0
################################################
#                  DISCLAIMER                  #
#   This code is not good for 1 major reason   #
#     We are unable to see if the reactions    #
#     are on messages from the Mudae BOTs      #
#   So any reaction on any message is counted  #
#       and therefore this code will not       #
#      return a close to correct result        #
################################################

# Imports
from collections import defaultdict as ddict

# Initialising a default dict to store the reactions
dct = ddict(int)

# List of files that the program will read through
# Should be .txt files
# We recommend compiling your data using DiscordChatExporter
files = [

    # Dataset 1
    # 2021-03-15 22:00 UTC - 2021-03-31 07:26 UTC
    # After Light kakera was added

    #"resources-1/flood-1.txt",
    #"resources-1/flood-2.txt",
    #"resources-1/flood-3.txt",
    #"resources-1/flood-4.txt",
    #"resources-1/flood-5.txt",
Example #43
0
def analyze(folder):
    '''

    Parses the result files from the search and writes a result .csv file which
    contains the data to plot figure 2.

    '''

    R = ursgal.UController(
        profile='QExactive+',
        params=GENERAL_PARAMS
    )
    csv_collector = {}
    ve = 'qvality_2_02'

    sample_regex_pattern = 'sample\d_R0\d'

    sample_2_x_pos_and_mq_offset = {}

    sample_offset_combos = []

    all_tested_offsets = [str(n) for n in range(-20, 21, 2)]

    for pos, (mq_ppm_off, mzML_file) in enumerate(MQ_OFFSET_TO_FILENAME):
        _sample = re.search(sample_regex_pattern, mzML_file).group()
        sample_2_x_pos_and_mq_offset[_sample] = (pos, mq_ppm_off)
        for theo_offset in all_tested_offsets:
            sample_offset_combos.append((_sample, theo_offset))

    for csv_path in glob.glob(os.path.join('{0}'.format(folder), '*', '*_unified.csv')):
        dirname = os.path.dirname(csv_path)
        sample = re.search(sample_regex_pattern, csv_path).group()
        splitted_basename = os.path.basename(csv_path).split('_')
        offset = splitted_basename[2]
        precursor_ion_tolerance = splitted_basename[4]
        frag_ion_tolerance = splitted_basename[6]
        prefix = '_'.join(splitted_basename[:7])

        R.params['machine_offset_in_ppm'] = offset
        R.params['precursor_mass_tolerance_minus'] = precursor_ion_tolerance
        R.params['precursor_mass_tolerance_plus'] = precursor_ion_tolerance
        R.params['frag_mass_tolerance'] = frag_ion_tolerance
        R.params['prefix'] = prefix

        validated_path = csv_path.replace(
            '_unified.csv',
            '_{0}_validated.csv'.format(ve)
        )
        if os.path.exists(validated_path):
            csv_path = validated_path
        else:
            try:
                csv_path = R.validate(
                    input_file=csv_path,
                    engine=ve
                )
            except:
                continue

        pit_fit = (precursor_ion_tolerance, frag_ion_tolerance)

        if pit_fit not in csv_collector.keys():
            csv_collector[pit_fit] = ddict(set)

        csv_key = (sample, offset)

        print('Reading file: {0}'.format(csv_path))
        for line_dict in csv.DictReader(open(csv_path, 'r')):
            if line_dict['Is decoy'] == 'true':
                continue
            if float(line_dict['PEP']) <= 0.01:
                csv_collector[pit_fit][csv_key].add(
                    '{0}{1}'.format(
                        line_dict['Sequence'],
                        line_dict['Modifications']
                    )
                )

    fieldnames = [
        'Sample',
        'pos',
        'MQ_offset',
        'tested_ppm_offset',
        'peptide_count'
    ]

    outfile_name_format_string = 'bruderer_data_ppm_sweep_precursor_mass_tolerance_{0}_fragment_mass_tolerance_{1}.csv'

    for pit_fit in csv_collector.keys():
        with open(outfile_name_format_string.format(*pit_fit), 'w') as io:
            csv_writer = csv.DictWriter(io, fieldnames)
            csv_writer.writeheader()

            # write missing values
            for sample_offset in sample_offset_combos:
                sample, ppm_offset = sample_offset
                if sample_offset not in csv_collector[pit_fit].keys():
                    dict_2_write = {
                        'Sample': sample,
                        'pos': sample_2_x_pos_and_mq_offset[sample][0],
                        'MQ_offset': '',
                        'tested_ppm_offset': ppm_offset,
                        'peptide_count': 0,
                    }
                    csv_writer.writerow(dict_2_write)

            for (sample, ppm_offset), peptide_set in csv_collector[pit_fit].items():
                dict_2_write = {
                    'Sample': sample,
                    'pos': sample_2_x_pos_and_mq_offset[sample][0],
                    'MQ_offset': sample_2_x_pos_and_mq_offset[sample][1] * -1,
                    'tested_ppm_offset': ppm_offset,
                    'peptide_count': len(peptide_set),
                }
                csv_writer.writerow(dict_2_write)
    return
Example #44
0
    def create_year2id(self, triple_time):
        year2id = dict()
        freq = ddict(int)
        count = 0
        year_list = []

        for k, v in triple_time.items():
            try:
                start = v[0].split('-')[0]
                end = v[1].split('-')[0]
            except:
                pdb.set_trace()

            if start.find('#') == -1 and len(start) == 4:
                year_list.append(int(start))
            if end.find('#') == -1 and len(end) == 4:
                year_list.append(int(end))

        # for k,v in entity_time.items():
        # 	start = v[0].split('-')[0]
        # 	end = v[1].split('-')[0]

        # 	if start.find('#') == -1 and len(start) == 4: year_list.append(int(start))
        # 	if end.find('#') == -1 and len(end) ==4: year_list.append(int(end))
        # 	# if int(start) > int(end):
        # 	# 	pdb.set_trace()

        year_list.sort()
        for year in year_list:
            freq[year] = freq[year] + 1

        year_class = []
        count = 0
        for key in sorted(freq.keys()):
            count += freq[key]
            if count > 300:
                year_class.append(key)
                count = 0
        prev_year = 0
        i = 0
        for i, yr in enumerate(year_class):
            year2id[(prev_year, yr)] = i
            prev_year = yr + 1
        year2id[(prev_year, max(year_list))] = i + 1
        self.year_list = year_list

        # for k,v in entity_time.items():
        # 	if v[0] == '####-##-##' or v[1] == '####-##-##':
        # 		continue
        # 	if len(v[0].split('-')[0])!=4 or len(v[1].split('-')[0])!=4:
        # 		continue
        # 	start = v[0].split('-')[0]
        # 	end = v[1].split('-')[0]
        # for start in start_list:
        # 	if start not in start_year2id:
        # 		start_year2id[start] = count_start
        # 		count_start+=1

        # for end in end_list:
        # 	if end not in end_year2id:
        # 		end_year2id[end] = count_end
        # 		count_end+=1

        return year2id
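A small worked example (with invented year frequencies) of the binning performed by create_year2id: a bin is closed once the cumulative frequency exceeds 300, and each (start_year, end_year) interval is mapped to a class id:

from collections import defaultdict as ddict

freq = ddict(int)
for year, n in [(1990, 120), (1995, 200), (2001, 150), (2005, 180), (2010, 90)]:
    freq[year] += n

year_class, count = [], 0
for key in sorted(freq):
    count += freq[key]
    if count > 300:
        year_class.append(key)
        count = 0

year2id, prev_year, i = {}, 0, 0
for i, yr in enumerate(year_class):
    year2id[(prev_year, yr)] = i
    prev_year = yr + 1
year2id[(prev_year, max(freq))] = i + 1

print(year2id)  # {(0, 1995): 0, (1996, 2005): 1, (2006, 2010): 2}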
Example #45
0
import pickle
import json
from collections import defaultdict as ddict
from collections import Counter
import os.path as osp
import numpy.random as npr
import numpy as np
from pprint import pprint

CityDic = ddict(set)
target = "business_id"
sfile = "CityRest.pkl"
lfile = "business.json"

if not osp.isfile(sfile):
    for idx, oneobs in enumerate(open(lfile, "r")):
        print(f"Current index is {idx}")
        oneobs = json.loads(oneobs)
        CityDic[oneobs["city"]].add(oneobs[target])
        if idx == 1000:
            pass  #break

    with open(sfile, "wb") as f:
        pickle.dump(CityDic, f)

with open(sfile, "rb") as f:
    CityDic = pickle.load(f)

CityC = Counter()

for key, dat in CityDic.items():
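The loop above is cut off in this example; a plausible continuation (an assumption, not the original code) that matches the Counter set up just before it:

# assumed continuation: count how many businesses each city contains
for key, dat in CityDic.items():
    CityC[key] = len(dat)

pprint(CityC.most_common(10))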
def load_data(dataset_str, args):
        
        names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        if 'nell' in dataset_str:
                data_dict = pickle.load(open('./data/{}_data.pkl'.format(dataset_str), 'rb'), encoding='latin1')
                x, y, tx, ty, allx, ally, graph = data_dict['x'], data_dict['y'], data_dict['tx'], data_dict['ty'], data_dict['allx'], data_dict['ally'], data_dict['graph']

                index = list(range(allx.shape[0])) + data_dict['test.index']
                remap = {x: x for x in range(allx.shape[0])}
                remap.update({i+allx.shape[0]: x for i, x in enumerate(data_dict['test.index'])})
                remap_inv = {v: k for k, v in remap.items()}

                graph_new = ddict(list)
                for key, val in graph.items():
                        if key not in remap_inv: continue
                        graph_new[remap_inv[key]] = [remap_inv[v] for v in val if v in remap_inv]

                graph = graph_new
                test_idx_reorder = [remap_inv[x] for x in data_dict['test.index']]
        else:
                for i in range(len(names)):
                        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
                                if sys.version_info > (3, 0):
                                        objects.append(pickle.load(f, encoding='latin1'))
                                else:
                                        objects.append(pickle.load(f))

                x, y, tx, ty, allx, ally, graph = tuple(objects)
                test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))

        test_idx_range = np.sort(test_idx_reorder)

        if dataset_str == 'citeseer':
                # Fix citeseer dataset (there are some isolated nodes in the graph)
                # Find isolated nodes, add them as zero-vecs into the right position
                test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
                tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
                tx_extended[test_idx_range-min(test_idx_range), :] = tx
                tx = tx_extended
                ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
                ty_extended[test_idx_range-min(test_idx_range), :] = ty
                ty = ty_extended

        features = sp.vstack((allx, tx)).tolil()
        features[test_idx_reorder, :] = features[test_idx_range, :]
        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

        labels = np.vstack((ally, ty))
        labels[test_idx_reorder, :] = labels[test_idx_range, :]

        idx_test = test_idx_range.tolist()
        idx_train = range(len(y))
        idx_val = range(len(y), len(y)+500)

        train_mask = sample_mask(idx_train, labels.shape[0])
        val_mask   = sample_mask(idx_val, labels.shape[0])
        test_mask  = sample_mask(idx_test, labels.shape[0])

        y_train = np.zeros(labels.shape)
        y_val   = np.zeros(labels.shape)
        y_test  = np.zeros(labels.shape)
        y_train[train_mask, :] = labels[train_mask, :]
        y_val[val_mask, :] = labels[val_mask, :]
        y_test[test_mask, :] = labels[test_mask, :]

        return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
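load_data relies on two helpers that are not shown here, parse_index_file and sample_mask; minimal sketches of what such helpers conventionally look like in GCN-style loaders (an assumption, not code from this example):

import numpy as np

def parse_index_file(filename):
    # one integer index per line
    return [int(line.strip()) for line in open(filename)]

def sample_mask(idx, l):
    # boolean mask of length l that is True at the given indices
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask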
Example #47
0
import json, urllib, boto3, botocore, base64, time, traceback, random, string
from collections import defaultdict as ddict

ROUTES = {}
EXTRA_AUTH = ddict(list)
BUCKET = 'caraza-harter-cs301'
ADMIN_EMAIL = '*****@*****.**'
INSTRUCTOR_EMAILS = ['*****@*****.**', '*****@*****.**']
GRADER_EMAILS = [
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
]

s3_cache = None # client

def s3():
    # cache S3 client
    global s3_cache
    if s3_cache is None:
        s3_cache = boto3.client('s3')
    return s3_cache
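A short usage sketch of the cached client; the object key layout below is hypothetical:

def read_grade_file(project_id, email):
    # reuse the lazily created client instead of building a new one per request
    key = 'grades/{0}/{1}.json'.format(project_id, email)  # assumed key layout
    response = s3().get_object(Bucket=BUCKET, Key=key)
    return json.loads(response['Body'].read().decode('utf-8'))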
Example #48
0
def analyze(collector):
    """
    Simple analysis script for the cascade search,
    counting the number of identified peptides (combinations of peptide sequence and modifications)
    and PSMs (which additionally include the spectrum ID).
    """

    mod_list = ["Oxidation", "Deamidated", "Methyl", "Acetyl", "Phospho"]
    fieldnames = ([
        "approach", "count_type", "validation_engine", "unmodified",
        "multimodified"
    ] + mod_list + ["total"])

    csv_writer = csv.DictWriter(open("cascade_results.csv", "w"), fieldnames)
    csv_writer.writeheader()
    uc = ursgal.UController()
    uc.params["validation_score_field"] = "PEP"
    uc.params["bigger_scores_better"] = False

    # Count the number of identified peptides and PSMs for the different modifications
    # Spectra with multiple PSMs are sanitized, i.e. only the PSM with best PEP score is counted
    # and only if the best hit has a PEP that is at least two orders of
    # magnitude smaller than the others
    for validation_engine, result_file in collector.items():
        counter_dict = {"psm": ddict(set), "pep": ddict(set)}
        grouped_psms = uc._group_psms(result_file,
                                      validation_score_field="PEP",
                                      bigger_scores_better=False)
        for spec_title, grouped_psm_list in grouped_psms.items():
            best_score, best_line_dict = grouped_psm_list[0]
            if len(grouped_psm_list) > 1:
                second_best_score, second_best_line_dict = grouped_psm_list[1]
                best_peptide_and_mod = (best_line_dict["Sequence"] +
                                        best_line_dict["Modifications"])
                second_best_peptide_and_mod = (
                    second_best_line_dict["Sequence"] +
                    second_best_line_dict["Modifications"])

                if best_peptide_and_mod == second_best_peptide_and_mod:
                    line_dict = best_line_dict
                elif best_line_dict["Sequence"] == second_best_line_dict[
                        "Sequence"]:
                    if best_score == second_best_score:
                        line_dict = best_line_dict
                    else:
                        if (-1 * math.log10(best_score)) - (
                                -1 * math.log10(second_best_score)) >= 2:
                            line_dict = best_line_dict
                        else:
                            continue
                else:
                    if (-1 * math.log10(best_score)) - (
                            -1 * math.log10(second_best_score)) >= 2:
                        line_dict = best_line_dict
                    else:
                        continue
            else:
                line_dict = best_line_dict

            count = 0
            for mod in mod_list:
                if mod in line_dict["Modifications"]:
                    count += 1
            key_2_add = ""
            if count == 0:
                key_2_add = "unmodified"
            elif count >= 2:
                key_2_add = "multimodified"
            elif count == 1:
                for mod in mod_list:
                    if mod in line_dict["Modifications"]:
                        key_2_add = mod
                        break
            # for peptide identification comparison
            counter_dict["pep"][key_2_add].add(line_dict["Sequence"] +
                                               line_dict["Modifications"])
            # for PSM comparison
            counter_dict["psm"][key_2_add].add(line_dict["Spectrum Title"] +
                                               line_dict["Sequence"] +
                                               line_dict["Modifications"])
        for counter_key, count_dict in counter_dict.items():
            dict_2_write = {
                "approach": "cascade",
                "count_type": counter_key,
                "validation_engine": validation_engine,
            }
            total_number = 0
            for key, obj_set in count_dict.items():
                dict_2_write[key] = len(obj_set)
                total_number += len(obj_set)
            dict_2_write["total"] = total_number
            csv_writer.writerow(dict_2_write)
    return
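A numeric illustration (with made-up PEP values) of the sanitization rule described in the comment above: when the top two PSMs disagree, the best one is kept only if its PEP is at least two orders of magnitude smaller than the runner-up's:

import math

def best_psm_is_accepted(best_pep, second_best_pep):
    return (-1 * math.log10(best_pep)) - (-1 * math.log10(second_best_pep)) >= 2

print(best_psm_is_accepted(1e-5, 1e-3))  # True, exactly 2 orders of magnitude apart
print(best_psm_is_accepted(1e-4, 5e-3))  # False, only ~1.7 orders of magnitude apart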
Example #49
0
    def __init__(self, filename=None, run=None, overwrite=False):

        cElementTree.register_namespace("", "http://psi.hupo.org/ms/mzml")
        self.filename = filename
        self.lookup = {}

        self.newTree = None
        self.TreeBuilder = cElementTree.TreeBuilder()
        self.run = run
        self.info = {'counters': ddict(int)}

        if self.run.info['filename'].endswith('.gz'):
            import gzip
            import codecs
            io = codecs.getreader("utf-8")(gzip.open(
                self.run.info['filename']))
        else:
            io = open(self.run.info['filename'], 'r')

        #read the rest as original file
        input_xml_string = ''
        pymzml_tag_written = False
        #open again to read as text!
        for line in open(self.run.info['filename'], 'r').readlines():
            if 'indexedmzML' in line:
                # writing of indexed mzML is not possible at the moment
                continue
            if 'run' in line:
                # the run is appended from the original parser to avoid messing
                # with the new xml tree, we break before the run data starts
                break

            input_xml_string += line
            if 'softwareList' in line and pymzml_tag_written is False:
                addon = cElementTree.Element('software', {
                    'id': 'pymzML',
                    'version': "0.7.6"
                })
                cElementTree.SubElement(
                    addon, 'cvParam', {
                        'accession': 'MS:1000531',
                        'cvRef': 'MS',
                        'name': 'pymzML Writer',
                        'version': '0.7.6',
                    })
                new_line = cElementTree.tostring(addon, encoding='utf-8')
                input_xml_string += new_line
                pymzml_tag_written = True
        input_xml_string += '</mzML>\n'

        self.newTree = cElementTree.fromstring(input_xml_string)

        for event, element in cElementTree.iterparse(io,
                                                     events=(b'start',
                                                             b'end')):
            if event == b'start':
                if element.tag.endswith('}run'):
                    self.lookup['run'] = cElementTree.Element(
                        element.tag, element.attrib)
                if element.tag.endswith('}spectrumList'):
                    self.lookup['spectrumList'] = \
                        cElementTree.Element(element.tag, element.attrib)
                    self.lookup['spectrumIndeces'] = \
                        cElementTree.Element('index', {'name': 'spectrum'})
                    break
        return
def get_training_data(dbfile, V_dbfile=None):
    """
    This function gets training data from a database returns training and test data
    Args:
        dbfile: the base SQLite db file with the training data
        V_dbfile: an optional held out validation SQLite db file
    """
    db = sqlite3.connect(dbfile)
    cursor = db.cursor()
    if V_dbfile:
        V_db = sqlite3.connect(V_dbfile)
        V_cursor = V_db.cursor()
    split_method = args.split_method
    query = ""
    if split_method == 'entity':
        cursor.execute("""
        select raw, entity_id, label
        from string
        join entity on entity_id = entity.id
        join uniquestring on uniquestring_id = uniquestring.id
        """)
        data = list(cursor.fetchall())[:args.max_samples]
        entity_strings = ddict(list)
        for raw, eid, label in data:
            if len(entity_strings[(eid, label)]) < 100:
                entity_strings[(eid, label)].append(raw)
        entity_strings = list(entity_strings.items())
        random.shuffle(entity_strings)
        datalen = len(entity_strings)
        splitidx = int(args.split_ratio * datalen)

        train_strings, test_strings = entity_strings[:
                                                     splitidx], entity_strings[
                                                         splitidx:]
        train_X = []
        train_y = []
        test_X = []
        test_y = []

        for (eid, label), strings in train_strings:
            for string in strings:
                train_X.append(string)
                train_y.append(int(label))

        for (eid, label), strings in test_strings:
            for string in strings:
                test_X.append(string)
                test_y.append(int(label))

        return train_X, test_X, train_y, test_y
    elif split_method == 'unique':

        def get_unique_data(target_cursor, max_samples, skipset=None):
            target_cursor.execute("""
            select raw, p_malware
            from uniquestring 
            """)
            strings = []
            labels = []
            rows = list(target_cursor.fetchall())
            random.shuffle(rows)

            for string, p_malware in rows:
                string = string.lower()
                if skipset and string in skipset:
                    print "SKIPPING:", string
                    continue
                if p_malware == 1.0:
                    labels.append(1)
                elif p_malware < 1.0:
                    labels.append(0)
                strings.append(string)
                if len(strings) == max_samples:
                    break
            return strings, labels

        strings, labels = get_unique_data(cursor, args.max_samples)
        val_skipset = set(strings)

        if V_dbfile:
            V_vec, V_labels = get_unique_data(V_cursor, args.max_val_samples,
                                              val_skipset)

        datalen = len(labels)
        splitidx = int(args.split_ratio * datalen)
        train_strings, test_strings = strings[:splitidx], strings[splitidx:]
        train_labels, test_labels = labels[:splitidx], labels[splitidx:]

        if V_dbfile:
            return train_strings, test_strings, train_labels, test_labels, V_vec, V_labels
        else:
            return train_strings, test_strings, train_labels, test_labels
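A usage sketch for get_training_data; the args fields (split_method, split_ratio, max_samples, max_val_samples) are assumed from the function body, and the database file name is hypothetical:

import argparse

# get_training_data reads these fields from a module-level `args`
args = argparse.Namespace(
    split_method='entity',
    split_ratio=0.8,
    max_samples=50000,
    max_val_samples=10000,
)

train_X, test_X, train_y, test_y = get_training_data('strings.db')
print(len(train_X), len(test_X))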
Example #51
0
def get_paper_authorlist(data, need_paper_set):
    paper_authorlist = ddict(set)
    for (paperid, authorid, name, affi) in data.paperauthor_tuples:
        paper_authorlist[paperid].add(authorid)
    return paper_authorlist
def analyze(collector):
    '''
    Simple analysis script for the cascade search,
    counting the number of identified peptides (combinations of peptide sequence and modifications)
    and PSMs (which additionally include the spectrum ID)
    '''

    mod_list = ['Oxidation', 'Deamidated', 'Methyl', 'Acetyl', 'Phospho']
    fieldnames = [
        'approach', 'count_type', 'validation_engine', 'unmodified',
        'multimodified'
    ] + mod_list + ['total']

    csv_writer = csv.DictWriter(open('cascade_results.csv', 'w'), fieldnames)
    csv_writer.writeheader()
    uc = ursgal.UController()
    uc.params['validation_score_field'] = 'PEP'
    uc.params['bigger_scores_better'] = False

    # Count the number of identified peptides and PSMs for the different modifications
    # Spectra with multiple PSMs are sanitized, i.e. only the PSM with best PEP score is counted
    # and only if the best hit has a PEP that is at least two orders of magnitude smaller than the others
    for validation_engine, result_file in collector.items():
        counter_dict = {'psm': ddict(set), 'pep': ddict(set)}
        grouped_psms = uc._group_psms(result_file,
                                      validation_score_field='PEP',
                                      bigger_scores_better=False)
        for spec_title, grouped_psm_list in grouped_psms.items():
            best_score, best_line_dict = grouped_psm_list[0]
            if len(grouped_psm_list) > 1:
                second_best_score, second_best_line_dict = grouped_psm_list[1]
                best_peptide_and_mod = best_line_dict[
                    'Sequence'] + best_line_dict['Modifications']
                second_best_peptide_and_mod = second_best_line_dict[
                    'Sequence'] + second_best_line_dict['Modifications']

                if best_peptide_and_mod == second_best_peptide_and_mod:
                    line_dict = best_line_dict
                elif best_line_dict['Sequence'] == second_best_line_dict[
                        'Sequence']:
                    if best_score == second_best_score:
                        line_dict = best_line_dict
                    else:
                        if (-1 * math.log10(best_score)) - (
                                -1 * math.log10(second_best_score)) >= 2:
                            line_dict = best_line_dict
                        else:
                            continue
                else:
                    if (-1 * math.log10(best_score)) - (
                            -1 * math.log10(second_best_score)) >= 2:
                        line_dict = best_line_dict
                    else:
                        continue
            else:
                line_dict = best_line_dict

            count = 0
            for mod in mod_list:
                if mod in line_dict['Modifications']:
                    count += 1
            key_2_add = ''
            if count == 0:
                key_2_add = 'unmodified'
            elif count >= 2:
                key_2_add = 'multimodified'
            elif count == 1:
                for mod in mod_list:
                    if mod in line_dict['Modifications']:
                        key_2_add = mod
                        break
            # for peptide identification comparison
            counter_dict['pep'][key_2_add].add(line_dict['Sequence'] +
                                               line_dict['Modifications'])
            # for PSM comparison
            counter_dict['psm'][key_2_add].add(line_dict['Spectrum Title'] +
                                               line_dict['Sequence'] +
                                               line_dict['Modifications'])
        for counter_key, count_dict in counter_dict.items():
            dict_2_write = {
                'approach': 'cascade',
                'count_type': counter_key,
                'validation_engine': validation_engine
            }
            total_number = 0
            for key, obj_set in count_dict.items():
                dict_2_write[key] = len(obj_set)
                total_number += len(obj_set)
            dict_2_write['total'] = total_number
            csv_writer.writerow(dict_2_write)
    return
def main(mzml=None):
    """
    Example script for visualizing the m/z and intensity error, which is the
    basis for the scoring of the matches in pyQms.

    Use spectrum 1165 of the BSA1.mzML example file. A subrange of the spectrum
    from m/z 400 to 500 is used.

    Usage:
        ./visualize_scoring_information.py

    Note:
        This example does not require a reader to access MS spectra, since a
        simple peak list is used.

    """

    peak_list = [
        (404.2492407565097, 2652.905029296875),
        (405.3003310237508, 4831.56103515625),
        (408.8403673369115, 23153.7109375),
        (409.17476109421705, 10182.2822265625),
        (409.5098740355617, 4770.97412109375),
        (411.17196124490727, 3454.364013671875),
        (413.26627826402705, 6861.84912109375),
        (419.3157903165357, 90201.5625),
        (420.2440507067882, 11098.4716796875),
        (420.31917273788645, 22288.9140625),
        (420.73825281590496, 8159.7099609375),
        (421.2406187369968, 3768.656494140625),
        (427.3787652898548, 5680.43212890625),
        (433.3316647490907, 8430.30859375),
        (434.705984428002, 25924.38671875),
        (435.2080179219357, 11041.2060546875),
        (443.6708762397708, 4081.282470703125),
        (443.69049198141124, 5107.13330078125),
        (443.6974813419733, 9135.3125),
        (443.7112735313511, 2517650.0),
        (443.7282222289076, 5571.26025390625),
        (443.7379762316008, 5227.4033203125),
        (444.1998579474954, 3021.341796875),
        (444.21248374593875, 1156173.75),
        (444.71384916266277, 336326.96875),
        (445.21533524843596, 58547.0703125),
        (445.71700965093, 4182.04345703125),
        (446.1200302053469, 93216.3359375),
        (447.09963627699824, 3806.537109375),
        (447.1169242266495, 59846.37109375),
        (447.3464079857604, 13170.9541015625),
        (448.11566395552086, 9294.5107421875),
        (448.3500303628631, 3213.052490234375),
        (452.1123280000919, 5092.0869140625),
        (461.1934526664677, 4022.537353515625),
        (462.1463969367603, 99732.5),
        (463.14561508666384, 24247.015625),
        (464.1433022096936, 20417.041015625),
        (465.1421080732791, 3222.4052734375),
        (470.1669593722212, 8621.81640625),
        (475.23989190282134, 3369.073974609375),
        (493.27465300375036, 2725.885986328125),
        (496.0077303201583, 8604.0830078125),
    ]
    print("{0:-^100}".format("Library generation"))
    lib = pyqms.IsotopologueLibrary(
        molecules=["DDSPDLPK"],
        charges=[2],
        metabolic_labels=None,
        fixed_labels=None,
        verbose=True,
    )
    print("{0:-^100}".format("Library generation"))

    results = lib.match_all(
        mz_i_list=peak_list,
        file_name="BSA_test",
        spec_id=1165,
        spec_rt=29.10,
        results=None,
    )
    for key, i, entry in results.extract_results():
        p = pymzml.plot.Factory()
        label_mz_error = []
        label_i_error = []
        measured_peaks = []
        matched_peaks = []
        peak_info = ddict(list)
        # pprint.pprint(entry.peaks)
        for (
                measured_mz,
                measured_intensity,
                relative_i,
                calculated_mz,
                calculated_intensity,
        ) in entry.peaks:
            if measured_mz is not None:
                measured_peaks.append((measured_mz, measured_intensity))
                matched_peaks.append(
                    (calculated_mz,
                     calculated_intensity * entry.scaling_factor))
                mz_error = (measured_mz - calculated_mz) / (measured_mz * 1e-6)
                label_mz_error.append(
                    (calculated_mz, "{0:5.3f} ppm m/z error".format(mz_error)))
                scaled_intensity = calculated_intensity * entry.scaling_factor
                rel_i_error = (abs(measured_intensity - scaled_intensity) /
                               scaled_intensity)

                peak_info["measured peaks"].append(measured_mz)
                peak_info["theoretical peaks"].append(calculated_mz)
                peak_info["relative intensity"].append(relative_i)
                peak_info["scaled matched peaks"].append(calculated_intensity *
                                                         entry.scaling_factor)
                peak_info["mz error"].append(mz_error)
                peak_info["i error"].append(rel_i_error)

                if rel_i_error > 1:
                    rel_i_error = 1

                label_i_error.append(
                    (calculated_mz,
                     "{0:5.3f} rel. intensity error".format(rel_i_error)))

        mz_only = [n[0] for n in measured_peaks]
        mz_range = [min(mz_only) - 1, max(mz_only) + 1]
        peptide = results.lookup["formula to molecule"][key.formula][0]
        p.newPlot(
            header=
            "Formula: {0}; Peptide: {1}; Charge: {2}\n Amount: {3:1.3f}; Score: {4:1.3f}"
            .format(key.formula, peptide, key.charge, entry.scaling_factor,
                    entry.score),
            mzRange=mz_range,
        )
        p.add(measured_peaks, color=(0, 0, 0), style="sticks")
        p.add(matched_peaks, color=(0, 200, 0), style="triangles")
        p.add(label_mz_error, color=(255, 0, 0), style="label_x")
        p.add(label_i_error, color=(255, 0, 0), style="label_x")

        plot_name = os.path.join(
            os.pardir,
            "data",
            "Score_visualization_Peptide_{1}_Charge_{2}.xhtml".format(
                key.file_name, peptide, key.charge),
        )
        p.save(filename=plot_name, mzRange=mz_range)
        print("Plotted file {0}".format(plot_name))
        # print(entry)
        print("Match info")
        for key, value_list in sorted(peak_info.items()):
            print(key)
            print("[{0}]".format(",".join([str(n) for n in value_list])))
            print()
    return
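A tiny numeric sketch (with made-up measured values) of the two error measures that end up as plot labels above:

measured_mz, calculated_mz = 443.7112735, 443.7103
measured_i, scaled_i = 2517650.0, 2400000.0

mz_error = (measured_mz - calculated_mz) / (measured_mz * 1e-6)  # in ppm
rel_i_error = abs(measured_i - scaled_i) / scaled_i

print('{0:5.3f} ppm m/z error'.format(mz_error))            # ~2.194 ppm
print('{0:5.3f} rel. intensity error'.format(rel_i_error))  # ~0.049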
Example #54
0
def createGraphProxy(ssagraph):
    assert (not ssagraph.procs)  # should have already been inlined

    nodes = [
        BlockProxy(b.key, itertools.count(), block=b) for b in ssagraph.blocks
    ]
    allnodes = nodes[:]  # will also contain indirected nodes

    entryNode = None
    intypes = ddict(set)
    for n in nodes:
        invars = [phi.rval for phi in n.block.phis]
        for b, t in n.block.jump.getSuccessorPairs():
            intypes[b.key].add(t)

        if n.bkey == ssagraph.entryKey:
            assert (
                not entryNode and not invars
            )  # shouldn't have more than one entryBlock and entryBlock shouldn't have phis
            entryNode = n
            invars = ssagraph.inputArgs  # store them in the node so we don't have to keep track separately
            invars = [
                x for x in invars if x is not None
            ]  # will have None placeholders for Long and Double arguments
        n.invars = invars

    lookup = {}
    for n in nodes:
        assert len(
            intypes[n.bkey]
        ) != 2  # should have been handled by graph.splitDualInedges()

        if False in intypes[n.bkey]:
            lookup[n.bkey, False] = n
        if True in intypes[n.bkey]:
            lookup[n.bkey, True] = n
    assert unique(lookup.values())

    for n in nodes:
        n.blockdict = lookup
        block = n.block
        for (block2, t) in block.jump.getSuccessorPairs():
            out = [phi.get((block, t)) for phi in block2.phis]

            n2 = lookup[block2.key, t]
            n.outvars[n2] = out
            n.successors.append(n2)
            n2.predecessors.append(n)

    # sanity check
    for n in allnodes:
        assert (n.block is not None) == (n.num == 0)
        assert (n is entryNode) == (len(n.predecessors) == 0)
        assert unique(n.predecessors)
        assert unique(n.successors)
        for pn in n.predecessors:
            assert n in pn.successors
        assert set(n.outvars) == set(n.successors)
        for sn in n.successors:
            assert n in sn.predecessors
            assert len(n.outvars[sn]) == len(sn.invars)

    return entryNode, allnodes
Example #55
0
    def preflight(self):
        '''
        Formatting the command line and writing the param input file via 
        self.params

        Returns:
                dict: self.params
        '''
        self.param_file_name = os.path.join(self.params['output_dir_path'],
                                            'msfragger.params')
        # further prepare and translate params

        # pprint.pprint(self.params['translations']['_grouped_by_translated_key'])
        # pprint.pprint(self.params)
        # exit()
        self.params_to_write = {
            'output_file_extension':
            'tsv',  # tsv or pepXML we fix it...
            'output_format':
            'tsv',  # pepXML or tsv
            'digest_mass_range':
            '{0} {1}'.format(
                self.params['translations']['_grouped_by_translated_key']
                ['precursor_min_mass']['precursor_min_mass'],
                self.params['translations']['_grouped_by_translated_key']
                ['precursor_max_mass']['precursor_max_mass'])
        }

        write_exclusion_list = [
            'precursor_min_mass', 'precursor_max_mass', 'precursor_min_charge',
            'precursor_max_charge', 'label', '-Xmx', 'header_translations',
            'validation_score_field'
        ]

        additional_15N_modifications = []
        if self.params['translations']['_grouped_by_translated_key']['label'][
                'label'] == '15N':
            self.print_info(
                'Search with label=15N may still be error-prone. Evaluate with care!',
                caller='WARNING')
            for aminoacid, N15_Diff in ursgal.ukb.DICT_15N_DIFF.items():
                existing = False
                for mod_dict in self.params['mods']['fix']:
                    if aminoacid == mod_dict['aa']:
                        mod_dict['mass'] += N15_Diff
                        mod_dict['name'] += '_15N_{0}'.format(aminoacid)
                        existing = True
                if existing == True:
                    continue
                else:
                    mod_key = 'add_{0}_{1}'.format(
                        aminoacid,
                        ursgal.chemical_composition_kb.aa_names[aminoacid])
                    self.params_to_write[mod_key] = N15_Diff

        for msfragger_param_name in self.params['translations'][
                '_grouped_by_translated_key'].keys():
            for ursgal_param_name, param_value in self.params['translations'][
                    '_grouped_by_translated_key'][msfragger_param_name].items(
                    ):
                if msfragger_param_name in write_exclusion_list:
                    continue
                elif msfragger_param_name == 'enzyme':
                    '''
                    search_enzyme_name = Trypsin
                    search_enzyme_cutafter = KR
                    search_enzyme_butnotafter = P
                    '''
                    aa_site, term, inhibitor = param_value.split(';')
                    self.params_to_write['search_enzyme_name'] = self.params[
                        'enzyme']
                    self.params_to_write['search_enzyme_cutafter'] = aa_site
                    self.params_to_write[
                        'search_enzyme_butnotafter'] = inhibitor
                elif msfragger_param_name == 'num_enzyme_termini':
                    # num_enzyme_termini = 2 # 2 for enzymatic, 1 for
                    # semi-enzymatic, 0 for nonspecific digestion

                    if self.params['translations'][
                            '_grouped_by_translated_key']['enzyme'][
                                'enzyme'] == 'nonspecific':
                        self.params_to_write[msfragger_param_name] = 0
                    else:
                        self.params_to_write[
                            msfragger_param_name] = param_value
                elif msfragger_param_name == 'clear_mz_range':
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = '{0} {1}'.format(
                            min_mz, max_mz)
                elif msfragger_param_name == 'modifications':
                    '''
                    #maximum of 7 mods - amino acid codes, * for any amino acid, [ and ] specifies protein termini, n and c specifies peptide termini
                    variable_mod_01 = 15.9949 M
                    variable_mod_02 = 42.0106 [*
                    #variable_mod_03 = 79.96633 STY
                    #variable_mod_03 = -17.0265 nQnC
                    #variable_mod_04 = -18.0106 nE
                    '''
                    # print(self.params['translations']['_grouped_by_translated_key'][msfragger_param_name])
                    # pprint.pprint(self.params[ 'mods' ])
                    # exit()
                    mass_to_mod_aa = ddict(list)
                    for mod_dict in self.params['mods']['opt']:
                        '''
                        {'_id': 0,
                          'aa': '*',
                          'composition': {'C': 2, 'H': 2, 'O': 1},
                          'id': '1',
                          'mass': 42.010565,
                          'name': 'Acetyl',
                          'org': '*,opt,Prot-N-term,Acetyl',
                          'pos': 'Prot-N-term',
                          'unimod': True},
                        '''
                        aa_to_append = mod_dict['aa']
                        pos_modifier = None
                        if mod_dict['pos'] == 'Prot-N-term':
                            pos_modifier = '['
                        elif mod_dict['pos'] == 'Prot-C-term':
                            pos_modifier = ']'
                        elif mod_dict['pos'] == 'N-term':
                            pos_modifier = 'n'
                        elif mod_dict['pos'] == 'C-term':
                            pos_modifier = 'c'
                        elif mod_dict['pos'] == 'any':
                            pass
                        else:
                            print('''
                            Unknown positional argument for given modification:
                            {0}
                            MSFragger cannot deal with this, please use one of the following:
                            any, Prot-N-term, Prot-C-term, N-term, C-term
                            '''.format(mod_dict['org']))
                            sys.exit(1)
                        if pos_modifier is not None:
                            aa_to_append = '{0}{1}'.format(
                                pos_modifier, aa_to_append)
                        mass_to_mod_aa[mod_dict['mass']].append(aa_to_append)
                    for pos, (mass,
                              aa_list) in enumerate(mass_to_mod_aa.items()):
                        self.params_to_write['variable_mod_0{0}'.format(
                            pos + 1)] = '{0} {1}'.format(
                                mass, ''.join(aa_list))
                    for mod_dict in self.params['mods']['fix']:
                        '''
                        add_C_cysteine = 57.021464             # added to C - avg. 103.1429, mono. 103.00918
                        '''
                        if mod_dict['pos'] == 'Prot-N-term':
                            mod_key = 'add_Nterm_protein'
                        elif mod_dict['pos'] == 'Prot-C-term':
                            mod_key = 'add_Cterm_protein'
                        elif mod_dict['pos'] == 'N-term':
                            mod_key = 'add_Nterm_peptide'
                        elif mod_dict['pos'] == 'C-term':
                            mod_key = 'add_Cterm_peptide'
                        else:
                            mod_key = 'add_{0}_{1}'.format(
                                mod_dict['aa'],
                                ursgal.chemical_composition_kb.aa_names[
                                    mod_dict['aa']])
                        self.params_to_write[mod_key] = mod_dict['mass']

                elif msfragger_param_name == 'override_charge':
                    self.params_to_write[msfragger_param_name] = param_value
                    if param_value == 1:
                        self.params_to_write[
                            'precursor_charge'] = '{0} {1}'.format(
                                self.params['translations']
                                ['_grouped_by_translated_key']
                                ['precursor_min_charge']
                                ['precursor_min_charge'],
                                self.params['translations']
                                ['_grouped_by_translated_key']
                                ['precursor_max_charge']
                                ['precursor_max_charge'])

                else:
                    self.params_to_write[msfragger_param_name] = param_value
        self.write_params_file()

        self.input_file = os.path.join(self.params['input_dir_path'],
                                       self.params['input_file'])
        if self.input_file.lower().endswith('.mzml') or \
                self.input_file.lower().endswith('.mzml.gz') or \
                self.input_file.lower().endswith('.mgf'):
            self.params['translations']['mzml_input_file'] = self.input_file
        # elif self.input_file.lower().endswith('.mgf'):
        #     self.params['translations']['mzml_input_file'] = \
        #         self.meta_unodes['ucontroller'].get_mzml_that_corresponds_to_mgf( self.input_file )
        #     self.print_info(
        #         'MSFragger can only read Proteowizard MGF input files,'
        #         'the corresponding mzML file {0} will be used instead.'.format(
        #             os.path.abspath(self.params['translations']['mzml_input_file'])
        #         ),
        #         caller = "INFO"
        # )
        else:
            raise Exception(
                'MSFragger input spectrum file must be in mzML or MGF format!')

        # pprint.pprint(self.params['translations'])
        # exit()
        self.params['command_list'] = [
            'java',
            '-Xmx{0}'.format(self.params['translations']
                             ['_grouped_by_translated_key']['-Xmx']['-xmx']),
            '-jar', self.exe, self.param_file_name,
            self.params['translations']['mzml_input_file']
        ]

        self.params['translations']['output_file_incl_path'] = os.path.join(
            self.params['output_dir_path'], self.params['output_file'])
        return self.params
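A sketch (with two common modifications) of how the optional-modification branch above groups mods by mass and emits MSFragger variable_mod_0N lines; the mod dicts are simplified assumptions:

from collections import defaultdict as ddict

opt_mods = [
    {'aa': 'M', 'mass': 15.994915, 'pos': 'any',         'name': 'Oxidation'},
    {'aa': '*', 'mass': 42.010565, 'pos': 'Prot-N-term', 'name': 'Acetyl'},
]
pos_modifier_map = {'Prot-N-term': '[', 'Prot-C-term': ']', 'N-term': 'n', 'C-term': 'c'}

mass_to_mod_aa = ddict(list)
for mod in opt_mods:
    mass_to_mod_aa[mod['mass']].append(pos_modifier_map.get(mod['pos'], '') + mod['aa'])

for pos, (mass, aa_list) in enumerate(mass_to_mod_aa.items()):
    print('variable_mod_0{0} = {1} {2}'.format(pos + 1, mass, ''.join(aa_list)))
# variable_mod_01 = 15.994915 M
# variable_mod_02 = 42.010565 [*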
Example #56
0
    def load_data(self):
        triple_set = []
        with open(self.p.triple2id, 'r') as filein:
            for line in filein:
                tup = (int(line.split()[0].strip()),
                       int(line.split()[1].strip()),
                       int(line.split()[2].strip()))
                triple_set.append(tup)
        triple_set = set(triple_set)

        train_triples = []
        self.start_time, self.end_time, self.num_class = ddict(dict), ddict(
            dict), ddict(dict)
        triple_time, entity_time = dict(), dict()
        self.inp_idx, self.start_idx, self.end_idx, self.labels = ddict(
            list), ddict(list), ddict(list), ddict(list)
        max_ent, max_rel, count = 0, 0, 0

        with open(self.p.dataset, 'r') as filein:
            for line in filein:
                train_triples.append(
                    [int(x.strip()) for x in line.split()[0:3]])
                triple_time[count] = [
                    x.split('-')[0] for x in line.split()[3:5]
                ]
                count += 1

        # self.start_time['triple'], self.end_time['triple'] = self.create_year2id(triple_time,'triple')

        with open(self.p.entity2id, 'r', encoding="utf-8") as filein2:
            for line in filein2:
                # entity_time[int(line.split('\t')[1])]=[x.split()[0] for x in line.split()[2:4]]
                max_ent = max_ent + 1

        self.year2id = self.create_year2id(triple_time)
        # self.start_time['entity'], self.end_time['entity'] = self.create_year2id(entity_time,'entiy')
        # self.inp_idx['entity'],self.start_idx['entity'], self.end_idx['entity'] = self.create_id_labels(entity_time,'entity')
        self.inp_idx['triple'], self.start_idx['triple'], self.end_idx[
            'triple'] = self.create_id_labels(triple_time, 'triple')
        #pdb.set_trace()
        for i, ele in enumerate(self.inp_idx['entity']):
            if self.start_idx['entity'][i] > self.end_idx['entity'][i]:
                print(self.inp_idx['entity'][i], self.start_idx['entity'][i],
                      self.end_idx['entity'][i])
        self.num_class = len(self.year2id.keys())

        # for dtype in ['entity','triple']:
        # 	self.labels[dtype] = self.getOneHot(self.start_idx[dtype],self.end_idx[dtype], self.num_class)# Representing labels by one hot notation

        keep_idx = set(self.inp_idx['triple'])
        for i in range(len(train_triples) - 1, -1, -1):
            if i not in keep_idx:
                del train_triples[i]

        with open(self.p.relation2id, 'r') as filein3:
            for line in filein3:
                max_rel = max_rel + 1
        index = randint(1, len(train_triples)) - 1

        posh, rela, post = zip(*train_triples)
        head, rel, tail = zip(*train_triples)

        posh = list(posh)
        post = list(post)
        rela = list(rela)

        head = list(head)
        tail = list(tail)
        rel = list(rel)

        for i in range(len(posh)):
            if self.start_idx['triple'][i] < self.end_idx['triple'][i]:
                for j in range(self.start_idx['triple'][i] + 1,
                               self.end_idx['triple'][i] + 1):
                    head.append(posh[i])
                    rel.append(rela[i])
                    tail.append(post[i])
                    self.start_idx['triple'].append(j)

        self.ph, self.pt, self.r, self.nh, self.nt, self.triple_time = [], [], [], [], [], []
        for triple in range(len(head)):
            neg_set = set()
            for k in range(self.p.M):
                possible_head = randint(0, max_ent - 1)
                while (possible_head, rel[triple], tail[triple]
                       ) in triple_set or (possible_head, rel[triple],
                                           tail[triple]) in neg_set:
                    possible_head = randint(0, max_ent - 1)
                self.nh.append(possible_head)
                self.nt.append(tail[triple])
                self.r.append(rel[triple])
                self.ph.append(head[triple])
                self.pt.append(tail[triple])
                self.triple_time.append(self.start_idx['triple'][triple])
                neg_set.add((possible_head, rel[triple], tail[triple]))

        for triple in range(len(tail)):
            neg_set = set()
            for k in range(self.p.M):
                possible_tail = randint(0, max_ent - 1)
                while (head[triple], rel[triple], possible_tail
                       ) in triple_set or (head[triple], rel[triple],
                                           possible_tail) in neg_set:
                    possible_tail = randint(0, max_ent - 1)
                self.nh.append(head[triple])
                self.nt.append(possible_tail)
                self.r.append(rel[triple])
                self.ph.append(head[triple])
                self.pt.append(tail[triple])
                self.triple_time.append(self.start_idx['triple'][triple])
                neg_set.add((head[triple], rel[triple], possible_tail))

        # self.triple_time = triple_time
        # self.entity_time = entity_time
        self.max_rel = max_rel
        self.max_ent = max_ent
        self.max_time = len(self.year2id.keys())
        self.data = list(
            zip(self.ph, self.pt, self.r, self.nh, self.nt, self.triple_time))
        self.data = self.data + self.data[0:self.p.batch_size]
Example #57
0
def parse_evidence(
    fixed_labels=None,
    evidence_files=None,
    molecules=None,
    evidence_score_field=None,
    return_raw_csv_data=False,
):
    """
    Reads in the evidence files and returns the final formatted fixed labels,
    the evidence lookup (which is passed to the isotopologue library) and the
    final formatted molecules (fixed labels are stripped from the molecules).

    Note:

        Output .csv files from `Ursgal`_ (`Documentation`_) can directly be
        used. Also `mzTab`_ files can be used as input.

    .. _Ursgal:
        https://github.com/ursgal/ursgal

    .. _Documentation:
        http://ursgal.readthedocs.io/en/latest/

    .. _mzTab:
        http://www.psidev.info/mztab

    Args:
        fixed_labels (dict): dict with fixed labels, example format is shown
            below.
        evidence_files (list): list of evidence file paths.
        molecules (list): list of additional molecules
        evidence_score_field (str): specify fieldname which holds the search
            engine score (Default is "PEP")

    Example fixed label format::

        {
            'C' : [
                {
                    'element_composition': {
                        'O': 1,
                        'H': 3,
                        '14N': 1,
                        'C': 2
                    },
                    'evidence_mod_name': 'Carbamidomethyl'
                },
            ]
        }

    Returns:

        tuple: final formatted fixed label dict, evidence lookup, list of molecules

    """
    if molecules is None:
        molecules = []
    if evidence_score_field is None:
        evidence_score_field = "PEP"  #  default

    unimod_parser = pyqms.UnimodMapper()

    fixed_mod_lookup = {}
    amino_acid_2_fixed_mod_name = ddict(list)

    formatted_fixed_labels = None
    evidence_lookup = None
    molecule_set = set()

    all_fixed_mod_names = set()

    if fixed_labels is not None and len(fixed_labels.keys()) != 0:
        formatted_fixed_labels = {}
        for aa, fixed_mod_info_dict_list in fixed_labels.items():
            for fixed_mod_info_dict in fixed_mod_info_dict_list:
                if isinstance(fixed_mod_info_dict["element_composition"],
                              dict):
                    tmp_cc_factory = pyqms.chemical_composition.ChemicalComposition(
                    )
                    tmp_cc_factory.add_chemical_formula(
                        fixed_mod_info_dict["element_composition"])
                else:
                    tmp_cc_factory = fixed_mod_info_dict["element_composition"]
                # print(type(tmp_cc_factory))
                # print(fixed_mod_info_dict)
                if aa not in formatted_fixed_labels.keys():
                    formatted_fixed_labels[aa] = []
                formatted_fixed_labels[aa].append(
                    tmp_cc_factory.hill_notation_unimod())
                # save it under name and amino acid!
                fixed_mod_lookup[fixed_mod_info_dict[
                    "evidence_mod_name"]] = dc(tmp_cc_factory)
                amino_acid_2_fixed_mod_name[aa].append(
                    fixed_mod_info_dict["evidence_mod_name"])
                all_fixed_mod_names.add(
                    fixed_mod_info_dict["evidence_mod_name"])
                tmp_cc_factory.clear()

    cc_factory = pyqms.chemical_composition.ChemicalComposition()

    # this is the lookup for the lib with the evidences
    # tmp_evidences = ddict(list)
    tmp_evidences = {}

    csv_raw_data_to_return = {}
    # tmp_charges_of_evidences = set()
    for evidence_file in evidence_files:
        input_is_csv = False
        evidence_lookup = {}
        with codecs.open(evidence_file, mode="r",
                         encoding="utf-8") as openend_evidence_file:
            # first buffer the file here depending on mztab andf csv input
            if evidence_file.upper().endswith("CSV"):
                dict_reader = csv.DictReader(openend_evidence_file)
                modification_fieldname = "Modifications"
                rt_fieldname = "Retention Time (s)"
                seq_fieldname = "Sequence"
                input_is_csv = True
            elif evidence_file.upper().endswith("MZTAB"):
                dict_reader = csv.DictReader(
                    [
                        row for row in openend_evidence_file
                        if row[:3] in ["PSM", "PSH"]
                    ],
                    delimiter="\t",
                )
                modification_fieldname = "modifications"
                rt_fieldname = "retention_time"
                seq_fieldname = "sequence"
            else:
                print(
                    "The format {0} is not recognized by the pyQms adaptor function"
                    .format(os.path.splitext(evidence_file)[1]))

            input_buffer = []
            for line_dict in dict_reader:
                input_buffer.append(line_dict)
            csv_raw_data_to_return[evidence_file] = input_buffer
            for line_dict in input_buffer:

                modifications = line_dict.get(modification_fieldname, "")
                if modifications == "":
                    molecule = line_dict[seq_fieldname]
                else:
                    if input_is_csv:
                        formatted_mods = line_dict[modification_fieldname]
                    else:
                        formatted_mods = []
                        # 2-UNIMOD:4,3-UNIMOD:4
                        for pos_and_unimod_id in line_dict[
                                modification_fieldname].split(","):
                            pos, unimod_id = pos_and_unimod_id.split("-")
                            unimod_name = unimod_parser.id2name(
                                unimod_id.split(":")[1])
                            formatted_mods.append("{0}:{1}".format(
                                unimod_name, pos))
                        formatted_mods = ";".join(formatted_mods)

                    molecule = "{0}#{1}".format(line_dict[seq_fieldname],
                                                formatted_mods)

                dict_2_append = {}
                rt = line_dict.get(rt_fieldname, "")
                # seconds is the standard also for mzTab
                if rt != "":
                    dict_2_append["RT"] = float(rt) / 60.0  # always in min

                score = line_dict.get(evidence_score_field, "")
                if score != "":
                    dict_2_append["score"] = float(score)
                    dict_2_append["score_field"] = evidence_score_field
                else:
                    dict_2_append["score"] = "None"
                    dict_2_append["score_field"] = "None"

                if molecule not in tmp_evidences.keys():
                    tmp_evidences[molecule] = {
                        "evidences": [],
                        "trivial_names": set()
                    }
                tmp_evidences[molecule]["evidences"].append(dict_2_append)
                for trivial_name_key in [
                        "proteinacc_start_stop_pre_post_;",  # old ursgal style
                        "trivial_name",  # self defined name
                        "Protein ID",  # new ursgal style
                        "accession",  # mzTab style
                ]:
                    additional_name = line_dict.get(trivial_name_key, "")
                    if additional_name != "":
                        # use set to remove double values
                        tmp_evidences[molecule]["trivial_names"].add(
                            additional_name)

    mod_pattern = re.compile(r""":(?P<pos>[0-9]*$)""")

    all_molecules = list(molecules)

    if len(tmp_evidences.keys()) > 0:
        all_molecules += list(tmp_evidences.keys())

    for molecule_and_mods in sorted(all_molecules):
        # try to convert trivial name set to list for conveniences
        try:
            tmp_evidences[molecule_and_mods]["trivial_names"] = sorted(
                list(set(tmp_evidences[molecule_and_mods]["trivial_names"])))
        except:
            pass
        # print(molecule_and_mods)
        if "#" in molecule_and_mods:
            molecule, modifications = molecule_and_mods.split("#")
        else:
            molecule = molecule_and_mods
            modifications = None
        fixed_label_mod_addon_names = []
        if modifications is not None:
            mods_to_delete = []
            mod_list = modifications.split(";")
            for pos_in_mod_list, mod_and_pos in enumerate(mod_list):
                # OLD STYLE, no ':' in mod allowed!
                # mod, pos = mod_and_pos.split(':')
                # NEW STYLE, SILAC does not crash...
                for match in mod_pattern.finditer(mod_and_pos):
                    pos = int(match.group("pos"))
                    mod = mod_and_pos[:match.start()]
                    break

                modded_aa = molecule[int(pos) - 1]

                if (formatted_fixed_labels is not None
                        and modded_aa in formatted_fixed_labels.keys()
                        and mod in all_fixed_mod_names):
                    fixed_label_mod_addon_names.append(mod)
                    mods_to_delete.append(pos_in_mod_list)

            for modpos_2_remove in sorted(mods_to_delete, reverse=True):
                mod_list.pop(modpos_2_remove)

            if len(mod_list) > 0:
                molecule = "{0}#{1}".format(molecule, ";".join(mod_list))
            else:
                # nosetest does not like else and pass
                # molecule = molecule
                pass
        else:
            # fail check if fixed mod is not in the modifications!
            # add all fixed modification!
            if formatted_fixed_labels is not None:
                for aa in molecule:
                    if aa in formatted_fixed_labels.keys():
                        for mod_name in amino_acid_2_fixed_mod_name[aa]:
                            fixed_label_mod_addon_names.append(mod_name)
        # print(molecule)
        if molecule.startswith("+"):
            cc_factory.add_chemical_formula(molecule)
        else:
            cc_factory.use(molecule)
        if len(fixed_label_mod_addon_names) != 0:
            for fixed_mod_name in fixed_label_mod_addon_names:
                cc_factory.add_chemical_formula(
                    fixed_mod_lookup[fixed_mod_name])
        complete_formula = cc_factory.hill_notation_unimod()

        molecule_set.add(molecule)
        if molecule_and_mods in tmp_evidences.keys():
            if complete_formula not in evidence_lookup.keys():
                evidence_lookup[complete_formula] = {}
            evidence_lookup[complete_formula][
                molecule_and_mods] = tmp_evidences[molecule_and_mods]

        cc_factory.clear()

    molecule_list = list(molecule_set)

    if return_raw_csv_data:
        return (
            formatted_fixed_labels,
            evidence_lookup,
            molecule_list,
            csv_raw_data_to_return,
        )
    else:
        return formatted_fixed_labels, evidence_lookup, molecule_list
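A usage sketch for parse_evidence based on the docstring above; the evidence file name is hypothetical, and the composition key matches the 'element_composition' key the code actually reads:

fixed_labels = {
    'C': [
        {
            'element_composition': {'O': 1, 'H': 3, '14N': 1, 'C': 2},
            'evidence_mod_name': 'Carbamidomethyl',
        },
    ]
}

formatted_fixed_labels, evidence_lookup, molecule_list = parse_evidence(
    fixed_labels=fixed_labels,
    evidence_files=['bsa_search_results_unified.csv'],
    evidence_score_field='PEP',
)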
Example #58
0
    def __init__(
            self,
            filter_freq=200,
            filter_stages=[],
            url_stats='result/matrix?show_stage_details=true&show_item_details=true',
            url_rules='formula',
            path_stats='data/matrix.json',
            path_rules='data/formula.json',
            update=False,
            banned_stages={},
            #                 expValue=30,
            ConvertionDR=0.18,
            display_main_only=True):
        """
        Object initialization.
        Args:
            filter_freq: int or None. The lowest frequence that we consider.
                No filter will be applied if None.
            url_stats: string. url to the dropping rate stats data.
            url_rules: string. url to the composing rules data.
            path_stats: string. local path to the dropping rate stats data.
            path_rules: string. local path to the composing rules data.
        """
        try:
            material_probs, self.convertion_rules = load_data(
                path_stats, path_rules)
        except Exception:
            print(
                'Requesting data from web resources (i.e., penguin-stats.io)...',
                end=' ')
            material_probs, self.convertion_rules = request_data(
                penguin_url + url_stats, penguin_url + url_rules, path_stats,
                path_rules)
            print('done.')
        if update:
            print(
                'Requesting data from web resources (i.e., penguin-stats.io)...',
                end=' ')
            material_probs, self.convertion_rules = request_data(
                penguin_url + url_stats, penguin_url + url_rules, path_stats,
                path_rules)
            print('done.')

        self.exp_factor = 1

        self.material_probs = material_probs
        self.banned_stages = banned_stages
        self.display_main_only = display_main_only
        self.stage_times = ddict(int)

        filtered_probs = []
        needed_stage = []
        for dct in material_probs['matrix']:
            if dct['times'] > self.stage_times[dct['stage'][
                    'code']] or self.stage_times[dct['stage']['code']] == 0:
                self.stage_times[dct['stage']['code']] = dct['times']
            if dct['times'] >= filter_freq and dct['stage'][
                    'code'] not in filter_stages:
                filtered_probs.append(dct)
            elif dct['stage']['code'] not in needed_stage:
                needed_stage.append(dct['stage']['code'])
        material_probs['matrix'] = filtered_probs
        self.ConvertionDR = ConvertionDR
        self._pre_processing(material_probs)
        self._set_lp_parameters()
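
# --- Illustrative sketch (not part of the original example) -----------------
# The constructor above keeps a drop record only if it was sampled at least
# filter_freq times and its stage is not explicitly filtered; every rejected
# stage code goes into needed_stage, while stage_times remembers the largest
# sample count seen per stage. A self-contained toy version of just that
# filtering step (function and argument names here are illustrative, not the
# original API):
from collections import defaultdict as ddict

def filter_matrix(matrix, filter_freq=200, filter_stages=()):
    stage_times = ddict(int)
    filtered_probs, needed_stage = [], []
    for dct in matrix:
        code = dct['stage']['code']
        if dct['times'] > stage_times[code] or stage_times[code] == 0:
            stage_times[code] = dct['times']
        if dct['times'] >= filter_freq and code not in filter_stages:
            filtered_probs.append(dct)
        elif code not in needed_stage:
            needed_stage.append(code)
    return filtered_probs, needed_stage, stage_times

# filter_matrix([{'stage': {'code': '1-7'}, 'times': 5000},
#                {'stage': {'code': 'S4-1'}, 'times': 50}])
# -> keeps the '1-7' record, lists 'S4-1' as a needed but low-sample stage,
#    and records {'1-7': 5000, 'S4-1': 50} as per-stage sample counts.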
Example #59
0
    def group_styles(self):
        '''
        Parses self.items() and builds up lookups.
        Additionally, a consistency check is performed to guarantee that each
        engine maps to only one style.

        The lookup built and returned looks like::

            lookup = {
                'style_2_engine' : {
                    'xtandem_style_1' : [
                        'xtandem_sledgehamer',
                        'xtandem_cylone',
                        ...
                    ],
                    'omssa_style_1' ...
                },
                # This is done during uNode initializations
                # each unode will register its style with umapmaster
                #
                'engine_2_style' : {
                    'xtandem_sledgehamer' : 'xtandem_style_1', ...
                },
                'engine_2_params' : {
                    'xtandem_sledgehamer' : [ uparam1, uparam2, ...], ...
                },
                'style_2_params' : {
                    'xtandem_style_1' : [ uparam1, uparam2, ... ], ...
                },
                'params_triggering_rerun' : {
                    'xtandem_style_1' : [ uparam1, uparam2 .... ]
                }
            }
        '''
        lookup = {
            'style_2_engine': ddict(set),
            'engine_2_style': {},
            # these two are not in the docu yet ...
            'engine_2_params': ddict(list),
            'style_2_params': ddict(list),
            'params_triggering_rerun': ddict(list)
        }
        for uparam, udict in sorted(self.items()):
            # print( uparam, end = '\t')
            # if uparam == 'force':
            #     print(udict)
            for style in udict['ukey_translation'].keys():
                try:
                    style_basename, style_version = style.split('_style_')
                except ValueError:
                    print('Syntax Error @ uparam {0}'.format(uparam))
                    print('style : {0}'.format(style))
                    exit(1)
                vvv = False
                if style_basename == 'ucontroller':
                    vvv = True
                    # print('UController params {0}'.format( uparam ))
                # else:
                # print()
                styles_seen = set()
                for engine in udict['available_in_unode']:
                    # if vvv:
                    #     print(engine )
                    if style_basename not in engine:
                        continue
                    # this style 2 engine lookup is not quite right ...
                    # This function requires unode meta info for proper
                    # mapping ....
                    # lookup['style_2_engine'][ style ].add( engine )
                    lookup['engine_2_params'][engine].append(uparam)

                    if style not in styles_seen:
                        lookup['style_2_params'][style].append(uparam)
                        if udict.get('triggers_rerun', True):
                            lookup['params_triggering_rerun'][style].append(
                                uparam)
                        styles_seen.add(style)

                    parsed_e2s = lookup['engine_2_style'].get(engine, None)
                    if parsed_e2s is None:
                        lookup['engine_2_style'][engine] = style
                    else:
                        if parsed_e2s != style:
                            print('{0} was found to map on style {1} and {2}'.
                                  format(engine, parsed_e2s, style))
        return lookup
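
# --- Illustrative sketch (not part of the original example) -----------------
# group_styles() expects self.items() to yield (uparam, udict) pairs in which
# udict provides 'ukey_translation' (keyed by '<basename>_style_<n>') and
# 'available_in_unode' (engine names containing that basename). The parameter
# value and engine names below are made up for illustration; feeding such a
# mapping through the loop above would populate the lookup roughly as the
# trailing comments show.
fake_uparams = {
    'precursor_mass_tolerance_unit': {
        'ukey_translation': {'xtandem_style_1': 'spectrum, parent error units'},
        'available_in_unode': ['xtandem_sledgehamer', 'xtandem_cyclone'],
        'triggers_rerun': True,
    },
}
# Expected lookup entries (conceptually):
#   lookup['engine_2_style']['xtandem_sledgehamer'] == 'xtandem_style_1'
#   lookup['engine_2_params']['xtandem_sledgehamer'] == ['precursor_mass_tolerance_unit']
#   lookup['style_2_params']['xtandem_style_1'] == ['precursor_mass_tolerance_unit']
#   lookup['params_triggering_rerun']['xtandem_style_1'] == ['precursor_mass_tolerance_unit']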
Example #60
0
    def __init__(self, prop, **kwargs):
        super(PropMerge, self).__init__(name=prop, **kwargs)
        self.seen = set()
        self._syms = ddict(list)