Example #1
def main(args):
    df = pd.read_csv(args.metadatafile)
    if not os.path.exists(args.outputdir):
        os.makedirs(args.outputdir)

    # drop rows that describe metadata sheets only (no Code)
    df.dropna(subset=["Code"], inplace=True)

    # remove the rows where Conditions are undefined
    undef = "undefined"
    df = df.query("Conditions != @undef")

    # make a dictionary to store, for each code, the list of
    # filenames for each replicate
    codemap = dd(lambda: dd(list))
    for row_index, row in df.iterrows():
        filename = row["Filename"]
        code = row["Code"]
        # extract the replicate and then remove to get code only
        pattern = re.compile(r"-(\d+)")
        m = re.search(pattern, code)
        replicate = m.group(1)
        codeonly = re.sub(pattern, "", code)
        # print filename,code,codeonly,replicate
        codemap[codeonly][replicate].append(filename)

    for code, replist in codemap.items():
        concat(code, replist)
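
A quick sketch of what the replicate regex in this example does, assuming hypothetical codes of the form "XYZ-1" (a base code plus a numeric replicate suffix):

import re

pattern = re.compile(r"-(\d+)")
code = "XYZ-1"                                 # hypothetical code value
replicate = re.search(pattern, code).group(1)  # "1"
codeonly = re.sub(pattern, "", code)           # "XYZ"
print(codeonly, replicate)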
Example #2
    def __init__(self):
        self.data = []
        self.tag = []
        self.ngram = 4
        self.types = dd(int)
        self.frameType = 'fre'
        self.frames = dd(lambda: dd(int))
        self.threshold = 45  # remove frames below the threshold (default: top 45)
        self.tokenCount = 0
        self.utteranceCount = 0
        self.debug = 1
        self.filterFrameCount = 0
        self.filterUtterBound = False
        self.removeUtterBound = False
        self.filterTokenTags = False
        self.quiet = True
#        self.tagMap={'n':'n','pro':'n','adj':'adj', 'adv':'adv','conj':'conj',\
#                         'det':'det', 'qn':'det', 'prep':'prep', 'v':'v', 'aux':'v',\
#                         'part':'v', 'mod':'v', 'neg':'neg', \
#                         'co':'int', 'int':'int', 'wh':'wh'}
        self.tagMap={'n':'n', 'pro':'n', 'adj':'adj', 'adv':'adv', 'conj':'conj', 'det':'det',\
            'prep':'prep', 'v':'v', 'part':'v', 'mod':'v', 'aux':'v','int':'int',\
            'neg':'neg', 'wh':'wh'}
        #### regs ###
        self.regUtterBound = re.compile('^P_')
        self.regWordTag = re.compile('^(.*?)\/(.*?):')
        self.regWhTag = re.compile('^.*?\/.*?:wh:')
        self.regNoTag = re.compile('^(.*?)\/_?$')
Example #3
 def takeQueries(self):
   while 1:
     word = raw_input("Enter a word:")
     if word == 'exit':
       break
     print "CK func return value:", self.isCK(word)
     print "xSub func return value:", self.xSub(word, dd(set), dd(set), 0)
Example #4
    def compute_gradient(self, generative_model):
        gradient = dd(float)
        recprob = np.exp(self.partition) / np.sum(np.exp(self.partition))
        jointprob = np.copy(recprob)
        for latent_states in itertools.product(self.latent_states, repeat=4):
            for i in range(4):
                jointprob[latent_states] *= generative_model.px_given_z(self.sentence[i], latent_states[i])
        jointprob /= np.sum(jointprob)

        # Calculate feature expectation over q(z|x)
        feat_exp = dd(float)
        for latent_states in itertools.product(self.latent_states, repeat=4):
            for fcn in self.feature_functions()[1]:
                for i in range(4):
                    feat_exp[fcn(i, self.sentence, latent_states[i])] += recprob[latent_states]
            for fcn in self.feature_functions()[3]:
                feat_exp[fcn(0, self.sentence, *latent_states[0:3])] += recprob[latent_states]
                feat_exp[fcn(1, self.sentence, *latent_states[1:4])] += recprob[latent_states]

        # Calculate the gradient
        for latent_states in itertools.product(self.latent_states, repeat=4):
            #print latent_states, jointprob[latent_states]
            temp = dd(float)
            for fcn in self.feature_functions()[1]:
                for i in range(4):
                    temp[fcn(i, self.sentence, latent_states[i])] += 1
            for fcn in self.feature_functions()[3]:
                temp[fcn(0, self.sentence, *latent_states[0:3])] += 1
                temp[fcn(1, self.sentence, *latent_states[1:4])] += 1
            for feature in feat_exp:
                gradient[feature] += jointprob[latent_states] * (temp[feature] - feat_exp[feature])

        return gradient
Example #5
 def __init__(self, text):
     super(Word, self).__init__()
     self.text = text
     self.freq = 0.0
     self.left = dd(int)
     self.right = dd(int)
     self.aggreg = 0.0
Example #6
    def __init__(self,area):
        
        self.initiallat = area[3][0]
        self.initiallon = area[3][1]
        self.dist_range = 5.0 #nm
       
        self.alt_range = 1000.0 #ft
        self.t_cpa = 0
        self.dist_cpa = 0
        self.spd = np.array([])
        self.lat = np.array([])
        self.lon = np.array([])
        self.pos = np.array([])
        self.trk = 0

        self.alt_dif = 0
        self.alt = 0
        self.id = []
        
        self.complexity = dd(lambda:dd(int))
        self.rel_trk = np.array([])
        self.step = -1
        self.id_previous = []
        self.headings = []
        self.headings_previous = np.array([])
        self.doubleconflict = 0
        self.ntraf = 0
        self.compl_ac = 0
        self.time_lookahead = 1800 #seconds
        
        self.selected_area = ([area[0][0],area[0][1]],[area[1][0],area[1][1]],[area[2][0],area[2][1]],[area[3][0],area[3][1]])

        return
Example #7
 def __init__(self):
   self.posts = []
   self.userwiseThreads = dd(set)
   self.userwisePosts = dd(set) # Stores indices
   self.userStart = dd(lambda:5000)
   self.maxDay = -1
   self.userWeekwisePosts = dd(lambda:dd(list))
   self.userWeekwiseAccusations = dd(lambda:dd(set))
   self.fakeUsers = set() # Stores the postId of the previous fake annotation we did
   self.fakeUsersPosts = {}
   self.postIdMap = {}
   self.nonFakeUsers = set()
   self.twitterLexicon = set()
   self.pkWords = set(["napkin", "pumpkin", "pk", "upkeep"])
   self.kcWords = set(["kc", "backcast", "backcloth", "blackcock", "blackcurrant", "bookcase", "cockchafer", "dickcissel", "kekchi", "kinkcough",
                       "lockchester", "markcourt", "neckcloth", "packcloth", "sackcloth"])
   self.bkWords = set(["bk", "abk", "ebk", "bks", "abks", "ebks", "bkz", "abkz", "ebkz"])
   self.addPlurals(self.pkWords)
   self.pkWords.add("pk's")
   self.ccWords = set()
   self.ckWords = set()
   self.loadLexiconForCC()
   self.loadLexiconForCK()
   self.loadTwitterLexicon("")
   self.addAAESuffEnds(self.twitterLexicon)
   self.addXtreme(self.twitterLexicon)
   self.addPlurals(self.twitterLexicon)
   self.features = set(["cc", "ck", "bk", "pk", "hk", "oe", "3", "5", "6", "x", 'nword', 'hood', 'bCaret', 'cCaret', 'pCaret', 'hCaret'])
   self.activeForums = {}
   
   self.wordsNotConsideredLater = dd(int)
   self.wordsConsidered = dd(lambda:dd(int))
   self.consideredWordsCount = 0
Example #8
def load_key(fname):

    print >> sys.stderr, "loading %s" % fname
    d = dd(lambda: dd(lambda: dd(lambda : 0.)))

    lines = open(fname).readlines()
    #c = 0
    for line in lines:
        line = line.split()
        key, inst = line[:2]
        senses = line[2:]
        senses = [sense.split('/') for sense in senses]
        if len(senses) == 1:
            #c += 1
            d[key][inst][senses[0][0]] = 1.
        else:
            uni = []
            for sense in senses:
                if len(sense) == 1:
                    uni.append(sense)
                else:
                    d[key][inst][sense[0]] = float(sense[1])
            if len(uni) > 0:
                assert len(uni) != len(senses), "Some sense weighted, some not: %s" % inst
                val = 1. / len(uni)
                for sense in senses:
                    d[key][inst][sense[0]] = val
    return d
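
Judging from the parsing above, each line of the key file looks like "key instance sense[/weight] ...". A minimal sketch of how one weighted line would be interpreted (the line itself is hypothetical, inferred from the code rather than from any documentation):

line = "art.n art.n.bnc.00001 sense1/0.7 sense2/0.3".split()
key, inst = line[:2]
senses = [s.split('/') for s in line[2:]]
weights = {s[0]: float(s[1]) for s in senses}  # {'sense1': 0.7, 'sense2': 0.3}
print(key, inst, weights)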
Example #9
File: A.py Project: sbogdan/codejam
def solve():
  global A, motes, states
  motes.sort()
  states = [dd(int), dd(int)]
  old, new = 0, 1
  # state[idx, ops] = size
  states[new][0] = A
  shortcuts = set()

  for idx in range(0, len(motes)):
    new, old = old, new
    states[new] = dd(int)
    for ops in states[old]:
      size = states[old][ops]
      if size > motes[idx]:
        states[new][ops] = max(states[new][ops], size + motes[idx])
      elif size <= motes[idx]:
        # eat new particles
        if size != 1:
          tmp_size = size
          tmp_ops = 0
          while tmp_size <= motes[idx]:
            tmp_size += tmp_size - 1
            tmp_ops += 1
          states[new][ops+tmp_ops] = max(states[new][ops+tmp_ops], tmp_size + motes[idx])
            
        # eliminate next particle
        # states[new][ops+1] = max(states[new][ops+1], size)
        shortcuts.add(ops + len(motes) - idx)
  if len(shortcuts) and len(states[new]):
    return min(min(states[new].keys()), min(shortcuts))
  return min(shortcuts) if len(shortcuts) else min(states[new].keys())
Example #10
 def fetch_pos():
     pos_id = dd(lambda: dd())
     for r in query_omw("""SELECT id, tag, def FROM pos"""):
         pos_id['id'][r['id']]=r['tag']
         pos_id['tag'][r['tag']]=r['id']
         pos_id['def'][r['id']]=r['def']
     return pos_id
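
A note on the dd(lambda: dd()) pattern used here: the inner defaultdict is created without a default_factory, so only the outer level auto-creates entries and missing inner keys still raise KeyError. A minimal sketch:

from collections import defaultdict as dd

pos_id = dd(lambda: dd())
pos_id['id'][1] = 'n'   # outer key 'id' is auto-created, inner key 1 is set explicitly
pos_id['tag']           # merely accessing an outer key auto-creates an empty inner dict
# pos_id['def'][99]     # would raise KeyError: the inner defaultdict has no default_factory
print(dict(pos_id['id']), dict(pos_id['tag']))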
Example #11
    def gen_label_graph(self):
        labels, label2inst, not_label = [], dd(list), dd(list)
        for i in range(self.x.shape[0]):
            flag = False
            for j in range(self.y.shape[1]):
                if self.y[i, j] == 1 and not flag:
                    labels.append(j)
                    label2inst[j].append(i)
                    flag = True
                elif self.y[i, j] == 0:
                    not_label[j].append(i)

        while True:
            g, gy = [], []
            for _ in range(self.g_sample_size):
                x1 = random.randint(0, self.x.shape[0] - 1)
                label = labels[x1]
                if len(label2inst) == 1: continue
                x2 = random.choice(label2inst[label])
                g.append([x1, x2])
                gy.append(1.0)
                for _ in range(self.neg_samp):
                    g.append([x1, random.choice(not_label[label])])
                    gy.append( - 1.0)
            yield np.array(g, dtype = np.int32), np.array(gy, dtype = np.float32)
Example #12
    def get_file_dict(fns,header_prefix='') :
        file_map = dd(lambda: dd(list))
        out_fieldnames = []
        blank_entry = []
        for fn in fns :
            max_maps = 0
            f = reader(open(fn),delimiter='\t')
            #f = open(fn)
            fieldnames = f.next()
            fieldnames = fieldnames[2:] # we don't want existing knownGeneID or geneSymbol
            # read in the data, create a dictionary
            for l in f :
                if opts.symbols :
                    gene, symbol, data = l[0],l[1],l[2:]
                    symbol_map[gene] = symbol
                else :
                    gene, data = l.split('\t',1)
                file_map[fn][gene].append(data)
                max_maps = max(max_maps,len(file_map[fn][gene]))
                all_genes.add(gene)

            # if we're adding a binary column, do it
            if opts.binary_plus :
                out_fieldnames.append(header_prefix+fn+'.MAPPED')

            # construct the fieldnames for this file
            for i in range(max_maps) :
                out_fieldnames.extend(['%s%s.%d.%s'%(header_prefix,fn,i,h) for h in fieldnames])

            # pad out data entries w/ fewer than max_maps
            for gene,data in file_map[fn].items() :
                while len(data) < max_maps :
                    data.append(['']*len(fieldnames))
            file_map[fn]['blank'] = [['']*len(fieldnames) for _ in range(max_maps)]
        return file_map,out_fieldnames
Example #13
 def gridIllumination(self, N: int, lamps: List[List[int]], queries: List[List[int]]) -> List[int]:
     result = []
     row = dd(int)
     col = dd(int)
     ru_ld = dd(int)
     lu_rd = dd(int)
     lamp_set = set()
     for lamp in lamps:
         x, y = lamp
         row[x] += 1
         col[y] += 1
         ru_ld[x-y] += 1
         lu_rd[x+y] += 1
         lamp_set.add((x, y))
     
     dx = [-1, -1, -1, 0, 0, 0, 1, 1, 1]
     dy = [-1, 0, 1, -1, 0, 1, -1, 0, 1]
     for query in queries:
         x, y = query
         if row[x] > 0 or col[y] > 0 or ru_ld[x-y] > 0 or lu_rd[x+y] > 0:
             result.append(1)
         else:
             result.append(0)
         for p, q in zip(dx, dy):
             nx = p + x
             ny = q + y
             if nx >= 0 and nx < N and ny >= 0 and ny < N and (nx, ny) in lamp_set:
                 row[nx] -= 1
                 col[ny] -= 1
                 ru_ld[nx - ny] -= 1
                 lu_rd[nx + ny] -= 1
                 lamp_set.remove((nx, ny))
     return result
Example #14
    def predict_by_chunks(self):
        predictions = dd(dict)
        # optimization
        self.optimize()
        self.logger.info(self.clf_wrapper.classifier)

        for tw, chunks in self.dataset.viewitems():
            tw_predictions = dd(list)
            #tw_scores = []
            for i, datasets in enumerate(chunks): #test set and train set
                tr, te  = datasets
                X_train, X_test, y_train, y_test, test_inst_order = self._prepare(tw,tr,te)

                #score = 0.0
                try:
                    self.clf_wrapper.classifier.fit(X_train, y_train)
                    prediction = self.clf_wrapper.classifier.predict(X_test)
                    #score = self.clf_wrapper.classifier.score(x_test, y_test)
                except ValueError, e: # all instances are belongs to the same class
                    self.logger.warning("{}-{}: {}".format(self.clf_wrapper.name, tr, e))
                    if str(e) == "The number of classes has to be greater than one.":
                        print >> sys.stderr, "initialization", y_train[0]
                        prediction = [y_train[0]] * len(y_test)
                        #score = sum(prediction == y_test) / float(len(y_test))
                    if str(e) == "Input X must be non-negative.":
                        pass
                tw_predictions[i].extend(zip(test_inst_order, prediction))
            for i in xrange(len(chunks)):
                predictions[i][tw] = dict(tw_predictions[i])
Example #15
 def initialize_data(self, elicited_features):
     # sets term_indices (per language), languages (set of languages), nS
     # (number of situations) as well as CMs (count matrices for every
     # language; { language : nS x nT_language })
     #fn = 'data/%s/elicited_features.csv' % self.data_folder
     #with open(fn, 'r') as fh:
     #    self.elicited_features = list(csv.reader(fh))
     self.elicited_features = elicited_features
     #self.filter_data()
     #YM: Filters the data to eliminate any situations occurring only in one of the target languages.
     self.term_indices = dd(lambda : {})
     self.languages = set()
     self.nS = 0
     CM_constructor = dd(lambda : dd(lambda : dd(float)))
     #
     for language, subject, situation, word in self.elicited_features:
         self.languages.add(language)
         self.nS = np.max([self.nS, int(situation)+1])
         try:
             word_ix = self.term_indices[language][word]
         except KeyError:
             lt = len(self.term_indices[language])
             word_ix = self.term_indices[language][word] = lt
         CM_constructor[language][int(situation)][word_ix] += 1.0
     self.CMs = {language :
                     np.zeros((self.nS, len(self.term_indices[language]))) 
                  for language in self.languages}
     for language, v1 in CM_constructor.items():
         for situation, v2 in v1.items():
             for term, count in v2.items():
                 self.CMs[language][situation,term] = count
     return
Example #16
 def __init__(self, *args, **kwargs):
     """Constructor for the corpus object. Takes a list of documents, and
     constructs the corpus.
     """
     self.__docids = set()
     self.__entities = dd(int)
     self.__entity_index = dd(list)
Example #17
    def __init__(self,files,keys,ktype,prefix): #,strictLevel):


        self.METHOD="UNIQUE_SENSE"
        if ktype == 'binary':
            self.groupDict = {}
            self.flist = []
            for line in open(keys):
                line=line.split()
                if line[1] == '0' or line[1] == '1':
                    self.groupDict[line[0]]=int(line[1])
                    self.flist.append(line[0])
                    self.keyType = 'binary'
                else:
                    sys.stderr.write("Warning: Non-binary key supplied - assuming continuous variable\n")
                    self.keyType = 'continuous'
                    self.groupDict[line[0]] = float(line[1])
                    self.flist.append(line[0])


        self.cnt_dict = dd(lambda: dd(lambda: [0,0,0,0]))
        self.total_cnts = dd(float)
        for f in files:
            if f not in self.flist:
                print "ERROR XYZ"
                sys.exit()
            for line in open(f):
                line = line.split()
                self.cnt_dict[line[0]][f] = [float(line[1]),float(line[2]),float(line[3]),float(line[4])]
                self.total_cnts[f] = sum(self.cnt_dict[line[0]][f])
                

        self.prefix = prefix
Example #18
 def __init__(self):
   self._userWise = dd(list)
   self._tweets = []
   self._scores = []
   self._correct = []
   self._mean = dd(float)
   self._variance = dd(float)
Example #19
 def __init__(self):
     sys.stderr.write("Tagger: In Constructor\n")
     ## Data containers
     self.__classes = []
     self.__train = []
     self.__test = []
     self.__output = []
     self.__accuracies = []
     ## Tagger Options and Settings
     self.__workDir = "/tmp"
     self.__tntTrain = "tnt-para"
     self.__trainOptions = ""
     self.__testOptions = "-v3 -m"
     self.__tntTest = "tnt"
     self.__trainFile = self.__workDir + "/" + "train"
     self.__testFile = self.__workDir + "/" + "test"
     self.__modelFile = self.__workDir + "/" + "model"
     ## Accuracy Stuff
     self.__tags = dd(lambda: dd(int))
     self.__sameLangContext = dd(lambda: dd(int))
     self.__diffLangContext = dd(lambda: dd(int))
     self.__prevWordDiffContext = dd(lambda: dd(int))
     self.__preprevWordDiffContext = dd(lambda: dd(int))
     self.__unknownWords = 0
     self.__totalCorrect = 0
     self.__totalWords = 0
     self.__totalSents = 0
     self.__correctSents = 0
Example #20
def main():
    parser = ap.ArgumentParser()
    parser.add_argument('eids')
    args = parser.parse_args()

    improvements = dd(lambda: dd(list))

    for fn, args, duration, result in rtk.dist.db.iter_results(args.eids):
        for r in result:
            for k, v in r.items():
                improvements[k][args['rand_seed']].append(v)

    for k in improvements.keys():
        all_results = np.vstack(improvements[k].values())
        assert all_results.shape == (len(improvements[k]), len(improvements[k].values()[0]))
        improvements[k] = (all_results.mean(axis=0), all_results.std(axis=0))

    x = range(1, improvements.iteritems().next()[1][0].shape[0]+1)
    for k, (mean, err) in improvements.iteritems():
        if 'vae' in k: continue
        plt.errorbar(x, mean, yerr=err, label=k)
    plt.axvline(x=5, color='purple')
    plt.axhline(y=0.25, color='green')
    plt.legend(loc='center right')
    plt.title('Performance Metrics for CRF Semi-supervised')
    plt.xlabel('Iteration #')
    plt.savefig('crf.pdf')
Example #21
 def compilePairs(self):
     tmpPairs = [];  self.endTable = dd(list); self.startTable=dd(list)
     for p in self.knownJxns:
         tmpPairs += [(p[i-1][1],p[i][0]) for i in range(1,len(p))]
     self.validPairs = sorted([t for t in set(tmpPairs)])
     for t in self.validPairs:
         self.endTable[t[0]].append(t[1]); self.startTable[t[1]].append(t[0])
Example #22
def get_mapped_senses(gold_instance_dict, sys_instance_dict, train_test_splits):
    mapped_senses = dict()
    for train, test in train_test_splits:

        gold_train_senses = get_max_sense(gold_instance_dict, train)
        sys_train_senses = get_max_sense(sys_instance_dict, train)

        d = dd(lambda: dd(int))
        # build the sense mapping matrix. Each system-gold occurrence equals
        # 1 point.
        for g_sense, s_sense in izip(gold_train_senses, sys_train_senses):
            d[s_sense][g_sense] += 1

        # Make majority vote among gold senses for each system sense occurred
        # together with the same instances.
        mapping = dict([(s_sense, max(d[s_sense].items(), key=lambda e: e[1])[0])
                        for s_sense in d])
        LOGGER.debug("Mapping: %s", mapping)

        # Map test senses. If mapping doesn't contain any sense mapping for
        # particular system sense; skip it.
        sys_test_senses = get_max_sense(sys_instance_dict, test)
        mapped_test_chunk = dict([(instance, mapping[s_sense]) for instance, s_sense
                             in izip(test, sys_test_senses) if s_sense in mapping])
        mapped_senses.update(mapped_test_chunk)

    return mapped_senses
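
The majority vote described in the comments reduces to the max-by-count expression above; a small standalone check with hypothetical sense labels:

from collections import defaultdict as dd

d = dd(lambda: dd(int))
for g_sense in ['animal', 'animal', 'institution']:
    d['bank.sys1'][g_sense] += 1
mapping = dict([(s_sense, max(d[s_sense].items(), key=lambda e: e[1])[0]) for s_sense in d])
print(mapping)  # {'bank.sys1': 'animal'}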
Example #23
 def __init__(self, dataFile):
   self.__data = []
   self.__commWiseIndices = {}
   self.__commWiseSampleIndices = {}
   self.__commWiseSampleWordFreq = dd(lambda:dd(int))
   self.__read(dataFile)
   self._tok = Tokenizer(preserve_case=False)
Example #24
def accuracy(test, output):
    outputLines = [cmp(float(l.strip()),0) for l in open(output)]
    testLines = [l.strip() for l in open(test)]
    classDict = {1:'black', -1:'jewish'}
    A = 0
    Correct = dd(int)
    Actual = dd(int)
    Given = dd(int)
    for index in range(len(outputLines)):
        
        testClass = int(testLines[index].split()[0])
        outputClass = outputLines[index]
        if testClass == outputClass:
            A += 1
            Correct[classDict[testClass]] += 1
        Given[classDict[outputClass]] += 1
        Actual[classDict[testClass]] += 1
            
    A = str(round(A*100.0/len(testLines),2))
    PBlack = str(round(Correct['black'] *100.0/Given['black'],2))
    RBlack = str(round(Correct['black'] *100.0/Actual['black'],2))    
    PJew = str(round(Correct['jewish'] *100.0/Given['jewish'],2))
    RJew = str(round(Correct['jewish'] *100.0/Actual['jewish'],2))
    #content = open(output).read()
    #A = content.split('Accuracy on test set:')[1].split('(')[0].strip()
    #P = content.split('Precision/recall on test set:')[1].split('/')[0].strip()
    #C = content.split('Precision/recall on test set:')[1].split('/')[1].strip()
    
    return A, PBlack, RBlack, PJew, RJew
Example #25
 def preparePostsSingleDoc(self, outputFile):
   outputFile = open(outputFile,'w')
   backGroundVector = dd(int)
   for key in self.__commWiseSampleIndices.iterkeys():
     for index in self.__commWiseSampleIndices[key]:
       tokens = self._tokenize(self.__data[index][1])
       freqVector = self.freqVector(tokens)
       for token, freq in freqVector.iteritems():
         backGroundVector[token] += freq
   
   print "Background words:",len(backGroundVector)
   filteredLexicon = self.__filterWords(backGroundVector)
   print "Filtered Words:",len(filteredLexicon)
   ##self.analyzeLexicon(filteredLexicon, backGroundVector)
   ##sys.exit()
   
   for key in self.__commWiseSampleIndices.iterkeys():
     globalFreqVector = dd(int)
     for index in self.__commWiseSampleIndices[key]:
       tokens = self._tokenize(self.__data[index][1])
       freqVector = self.freqVector(tokens)
       for word, freq in freqVector.iteritems():
         globalFreqVector[word] += freq 
     words = [x+"$:$:"+str(y) for x,y in globalFreqVector.iteritems() if x in filteredLexicon]
     if len(words) > 0:
       outputFile.write(key+'\t'+'  '.join(words)+'\n')
   outputFile.write('background'+'\t'+'  '.join([x+"$:$:"+str(y) for x,y in backGroundVector.iteritems() if x in filteredLexicon])+'\n')
   outputFile.close()
Example #26
def get_weight_matrix(cites, indices):
    id2index = {}
    for i, id in enumerate(indices):
        id2index[id] = i

    pair_cnt = dd(int)

    cited = dd(list)
    for c1, c2s in cites.iteritems():
        for c2 in c2s:
            cited[c2].append(c1)
        for ii in c2s:
            for jj in c2s:
                if ii == jj or ii not in id2index or jj not in id2index: continue
                i, j = id2index[ii], id2index[jj]
                pair_cnt[(i, j)] += 1

    for c1, c2s in cited.iteritems():
        for ii in c2s:
            for jj in c2s:
                if ii == jj or ii not in id2index or jj not in id2index: continue
                i, j = id2index[ii], id2index[jj]
                pair_cnt[(i, j)] += 1

    row, col, data = [], [], []
    for k, v in pair_cnt.iteritems():
        i, j = k
        row.append(i)
        col.append(j)
        data.append(v)

    row, col, data = np.array(row), np.array(col), np.array(data, dtype = np.float32)
    w = sparse.coo_matrix((data, (row, col)), shape = (len(indices), len(indices))).tocsr()
    return w
Example #27
def createPatternMap(stretchyPatterns):
    
    print "Creating pattern map"
    
    stretchyPatterns = open(stretchyPatterns)
    patternMap = dd(lambda: dd(int))
    
    count = 0
    while 1:
        
        stretchyLine = stretchyPatterns.readline().strip()
        
        if stretchyLine == "":
            break
        
        for p in set(stretchyLine.split("\t")[1:]):
            patternMap[p][stretchyLine.split("\t")[0]] += 1
            
        count += 1
        
        # hack: count % 5000 == 0 raises ZeroDivisionError, so the except
        # branch prints a running count every 5000 lines as a progress indicator
        try:
            dummy = 1/(count%5000)
        except:
            print count,
    
    print
    
    return patternMap
Example #28
def assign_orders_and_drones_to_warehouses(simulation_parameters, weights, warehouse_info, order_info, order_location_matrix):
    #
    # print order_location_matrix
    kmeans, cluster_assignments = find_centers(order_location_matrix, len(warehouse_info))
    warehouse_location_matrix = np.array([w['loc'] for w in warehouse_info.values()])
    # print warehouse_location_matrix
    # print kmeans.predict(warehouse_location_matrix)

    cluster_to_warehouses = find_closest_warehouse(kmeans.cluster_centers_, warehouse_location_matrix)
    warehouses_to_orders = dd(lambda: dict())
    warehouses_order_ids = dd(lambda: dict())
    warehouses_drone_numbers = dd(lambda: dict())
    total_order_weight = sum([order['weight'] for order in order_info.values()])
    # print total_order_weight
    # print simulation_parameters[2]
    for (cluster, warehouse) in enumerate(cluster_to_warehouses):
        assigned_orders = [order for (order, assignment) in enumerate(cluster_assignments) if assignment == cluster]
        total_order_weight_for_warehouse = sum([order_info[order]['weight'] for order in assigned_orders])
        # print total_order_weight_for_warehouse
        warehouses_to_orders[warehouse]['orders'] = [order for order in assigned_orders]
        warehouses_order_ids[warehouse] = [order for order in assigned_orders]
        warehouses_to_orders[warehouse]['n_drones'] = floor(float(total_order_weight_for_warehouse)/total_order_weight*simulation_parameters[2])
        warehouses_drone_numbers[warehouse] = [None, None]
        warehouses_drone_numbers[warehouse][0] = floor(float(total_order_weight_for_warehouse)/float(total_order_weight+len(warehouse_info))*simulation_parameters[2])
        warehouses_drone_numbers[warehouse][1] = floor((1-float(total_order_weight_for_warehouse)/float(total_order_weight+len(warehouse_info)))*simulation_parameters[2])
        # print warehouses_to_orders[warehouse]
        # print len(warehouses_to_orders[warehouse])

    #print warehouses_to_orders
    return warehouses_drone_numbers, warehouses_order_ids
Example #29
def prepareSpeakerBasedData():
    print 'Preparing speaker based data..'
    tweetDir = '/usr0/home/pgadde/Work/Ethnic/WordsBasedFiltering/MapReduce/ScreenNamesBased/'
    blackTweets = open(tweetDir + 'blackTweets')
    jewishTweets = open(tweetDir + 'jewishTweets')
    blackOutput = open(tweetDir + 'blackTweets.sp','w')
    jewishOutput = open(tweetDir + 'jewishTweets.sp','w')
    
    blackSpeakers = dd(list)
    for tweet in blackTweets:
        tweet = tweet.strip().split('\t')
        screenName = tweet[1]
        content = tweet[-1]
        blackSpeakers[screenName].append(content)
    for speaker in blackSpeakers.iterkeys():
        blackOutput.write(speaker+'\t'+'\t'.join(blackSpeakers[speaker])+'\n')
    blackOutput.close()
    
    jewishSpeakers = dd(list)
    for tweet in jewishTweets:
        tweet = tweet.strip().split('\t')
        screenName = tweet[1]
        content = tweet[-1]
        jewishSpeakers[screenName].append(content)
    for speaker in jewishSpeakers.iterkeys():
        jewishOutput.write(speaker+'\t'+'\t'.join(jewishSpeakers[speaker])+'\n')    
    jewishOutput.close()
Example #30
def aggregate(golden, guesses):
    """ Aggregates over the results """
    
    breakdown = dd(lambda : (0.0, 0.0, 0.0, 0.0))
    breakdown_N = dd(int)
    N = 0
    A, L, NL, R = 0.0, 0.0, 0.0, 0.0
    for tag, gold in golden.items():
        
        if tag in guesses:
            guess = guesses[tag]
            # assumes only 1 golden analysis
            if len(gold) > 1:
                for elem in gold[1:]:
                    assert elem == gold[0]
                    
            acc, lev, rank = evaluate(gold[0], guess)
            A, L, NL, R = A+acc, L+lev, NL+(lev / len(gold[0])), R+rank
            # compute results broken down by POS tag
            pos = tag[-1].split(",")[0].replace("pos=", "")
            _A, _L, _NL, _R = breakdown[pos]
            breakdown[pos] = _A+acc, _L+lev, _NL+(lev / len(gold[0])), _R+rank
            breakdown_N[pos] += 1
            
        else:
            sys.stderr.write("warning: no guess provided for (%s)\n" % " ".join(tag))
        N += 1

    return A/N, L/N, NL/N, R/N, breakdown, breakdown_N
"""
    This is an efficient algorithm to find the size of a subtree from every node in O(n) time.
    The idea is to use one dfs and first calculate the size of subtree of children of a node recursively.
    Then add the size of each subtree of its children to get the size of its subtree.
"""
from collections import defaultdict as dd


def dfs(source, parent):
    # Initial size of root is 1
    size[source] = 1
    for child in graph[source]:
        if child != parent:
            # Recursively calculate size of subtree of children nodes
            dfs(child, source)
            # Adding size of each child's subtree.
            size[source] += size[child]


size = dd(int)
graph = dd(set)
n = int(input())
for i in range(n - 1):
    u, v = map(int, input().split())
    graph[u].add(v)
    graph[v].add(u)
dfs(1, 0)
print(size)
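
The recursive DFS above can exceed CPython's default recursion limit (roughly 1000 frames) on deep trees. An equivalent iterative post-order sketch, reusing the same graph/size defaultdict shapes, might look like this:

from collections import defaultdict as dd

def subtree_sizes(graph, root):
    size = dd(int)
    # (node, parent, expanded?) -- a node is revisited once its children are done
    stack = [(root, 0, False)]
    while stack:
        node, parent, expanded = stack.pop()
        if expanded:
            # all children have been sized; accumulate them
            size[node] = 1 + sum(size[child] for child in graph[node] if child != parent)
        else:
            stack.append((node, parent, True))
            for child in graph[node]:
                if child != parent:
                    stack.append((child, node, False))
    return size

For the tree read above, subtree_sizes(graph, 1) reproduces the printed size mapping without deep recursion.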
Example #32
# changes requests_url_count dict to a list of tuples (url, count),
# sorts it in a proper way and prints
def print_result(requests_url_count, invalid_lines_count):
    url_count_list = [(url, count)
                      for url, count in requests_url_count.items()]
    # sort - count descending, url lexicographically
    url_count_list.sort(key=lambda p: (-p[1], p[0]))
    for url, count in url_count_list:
        print('"{0}",{1}'.format(url, count))
    if invalid_lines_count > 0:
        print('\nInvalid log lines: {0}'.format(invalid_lines_count),
              file=sys.stderr)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage: python page_report.py <path-to-log-file>\n')
    else:
        path = sys.argv[1]
        with open(path, 'r') as logs:
            requests_url_count = dd(int)
            invalid_lines_count = 0
            for line in logs:
                try:
                    stripped_url = parse_line(line)
                    requests_url_count[stripped_url] += 1
                except ValueError:
                    invalid_lines_count += 1

            print_result(requests_url_count, invalid_lines_count)
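
A tiny illustration of the sort key used in print_result: counts sort in descending order and ties break lexicographically by URL (the URLs here are hypothetical):

pairs = [("/b", 3), ("/c", 10), ("/a", 3)]
pairs.sort(key=lambda p: (-p[1], p[0]))
print(pairs)  # [('/c', 10), ('/a', 3), ('/b', 3)]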
Example #33
], dtype=float)


def _get_colors(num_colors):
    colors = []
    for i in np.arange(0., 360., 360. / num_colors):
        hue = i / 360.
        lightness = (50 + np.random.rand() * 10) / 100.
        saturation = (90 + np.random.rand() * 10) / 100.
        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
    return colors


#for iter in range(itertime):
kNN = 5
sga2sgaAaff = dd(set)
sgaAsga2aff = dd(int)
set_gene = set()
path = 'sga2sgaAff_baseline_brca.txt'
f = open(path, 'r')
pathout = 'out.txt'
fo = open(pathout, 'w')
next(f)
for line in f:
    l = line.strip().split('\t')
    sga1, sga2, aff = l[0], l[1], int(l[2])
    sga2sgaAaff[sga1].add((sga2, aff))
    #sgaAsga2aff[(sga1,sga2)] = aff

    set_gene.add(sga1)
    set_gene.add(sga2)
Example #34
from collections import defaultdict as dd
from math import sqrt, log, log2
from fractions import Fraction
from sys import stdin
import random

t = int(input())
for _ in range(t):
    n, k = map(int, input().split())
    nums = list(map(int, stdin.readline().split()))

    # n = 1000
    # k = random.randint(1, 10)
    # nums = []
    # for j in range(n):
    #     nums.append(random.randint(1, 10))

    freq = dd(int)
    maxkey, maxval = -1, 0
    for i in nums:
        t = i % k
        if t != 0:
            freq[k - t] += 1
            if freq[k - t] > maxval:
                maxval = freq[k - t]

    maxkey = 0
    for key in freq:
        if freq[key] == maxval and key > maxkey:
            maxkey = key

    if len(freq) == 0:
        print(0)
Example #35
 def doc2bow_from_word_ids(document):
     counter = dd(int)
     for word_idx in document:
         counter[word_idx] += 1
     document_bow = sorted(iteritems(counter))
     return document_bow
Example #36
from collections import defaultdict as dd

d = dd(int)

n = int(input())
for _ in range(n):
    a, x = map(int, input().split())
    d[a] = x
m = int(input())
for _ in range(m):
    b, y = map(int, input().split())
    if y > d[b]:
        d[b] = y
print(sum(d.values()))
Example #37
    def setup(self):
        self.train_test = np.append(self.data['train']['x_t'],
                                    self.data['test']['x_t'], 0)
        self.train_losses = dd(list)
        self.test_losses = dd(list)
        self.model_losses = []

        # Location of 0/-1 in the transformed space
        self.zero_line = self.model.scalery.inverse_transform(
            np.zeros((1, self.data['train']['y_t'].shape[-1])))
        self.neg_line = self.model.scalery.inverse_transform(
            np.zeros((1, self.data['train']['y_t'].shape[-1])) - 1)

        if self.args.darktheme:
            plt.style.use('dark_background')

        n_ext = 3  # extra rows, in addition to 1-1 scatter plots
        n_col = min(5, self.data['test']['y'].shape[1])
        n_row = n_ext + (n_col + n_col - 1) // n_col
        fig = plt.figure(figsize=(5 * n_col, 2 * n_row))
        meta = enumerate(GridSpec(n_row, 1, hspace=0.35))
        conts = [
            GridSubplot(1,
                        2 if i in [0, n_row - 1, n_row - 2] else n_col,
                        subplot_spec=o,
                        wspace=0.3 if i else 0.45) for i, o in meta
        ]
        axs = [
            plt.Subplot(fig, sub) for container in conts for sub in container
        ]
        axs = axs[:n_col + 2] + axs[-4:]
        [fig.add_subplot(ax) for ax in axs]

        self.axes = [ax.twinx() for ax in axs[:2]] + axs
        self.labels = get_labels(get_sensor_bands(self.args.sensor, self.args),
                                 self.model.output_slices, n_col)[:n_col]

        plt.ion()
        plt.show()
        plt.pause(1e-9)

        if self.args.animate:
            ani_path = Path('Animations')
            ani_tmp = ani_path.joinpath('tmp')
            ani_tmp.mkdir(parents=True, exist_ok=True)
            list(map(os.remove, ani_tmp.glob(
                '*.png')))  # Delete any prior run temporary animation files

            # '-tune zerolatency' fixes issue where firefox won't play the mp4
            # '-vf pad=...' ensures height/width are divisible by 2 (required by .h264 - https://stackoverflow.com/questions/20847674/ffmpeg-libx264-height-not-divisible-by-2)
            extra_args = [
                "-tune", "zerolatency", "-vf",
                "pad=width=ceil(iw/2)*2:height=ceil(ih/2)*2:color=white"
            ]
            ani_writer = self.ani_writer = animation.writers['ffmpeg_file'](
                fps=3, extra_args=extra_args)
            ani_writer.setup(fig,
                             ani_path.joinpath('MDN.mp4').as_posix(),
                             dpi=100,
                             frame_prefix=ani_tmp.joinpath('_').as_posix(),
                             clear_temp=False)
Example #38
 def __init__(self):
     self.d = dd()
Example #39
def get_estimates(args,
                  x_train=None,
                  y_train=None,
                  x_test=None,
                  y_test=None,
                  output_slices=None,
                  dataset_labels=None,
                  x_sim=None,
                  y_sim=None,
                  return_model=False,
                  return_coefs=False):
    '''
    Estimate all target variables for the given x_test. If a model doesn't
    already exist, creates a model with the given training data.
    '''
    # Add x/y scalers to the args object
    generate_scalers(args, x_train, x_test)

    if args.verbose:
        print(
            f'\nUsing {len(args.wavelengths)} wavelength(s) in the range [{args.wavelengths[0]}, {args.wavelengths[-1]}]'
        )
        if x_train is not None: print_dataset_stats(x=x_train, label='Train')
        if y_train is not None: print_dataset_stats(y=y_train, label='Train')
        if x_test is not None: print_dataset_stats(x=x_test, label='Test')
        if y_test is not None: print_dataset_stats(y=y_test, label='Test')

    # Add a few additional variables to be stored in the generated config file
    if x_train is not None: setattr(args, 'data_xtrain_shape', x_train.shape)
    if y_train is not None: setattr(args, 'data_ytrain_shape', y_train.shape)
    if x_test is not None: setattr(args, 'data_xtest_shape', x_test.shape)
    if y_test is not None: setattr(args, 'data_ytest_shape', y_test.shape)
    if dataset_labels is not None:
        sets_str = ','.join(sorted(map(str, np.unique(dataset_labels))))
        sets_hash = hashlib.sha256(sets_str.encode('utf-8')).hexdigest()
        setattr(args, 'datasets_hash', sets_hash)

    model_path = generate_config(args, create=x_train is not None)
    args.config_name = model_path.name

    predict_kwargs = {
        'avg_est': getattr(args, 'avg_est', False),
        'threshold': getattr(args, 'threshold', None),
        'confidence_interval': getattr(args, 'CI', None),
        'use_gpu': getattr(args, 'use_gpu', False),
        'chunk_size': getattr(args, 'chunk_size', 1e5),
        'return_coefs': True,
    }

    x_full, y_full = x_train, y_train
    x_valid, y_valid = None, None

    outputs = dd(list)
    for round_num in trange(args.n_rounds,
                            disable=args.verbose or (args.n_rounds == 1)
                            or args.silent):
        args.curr_round = round_num
        curr_round_seed = args.seed + round_num if args.seed is not None else None
        np.random.seed(curr_round_seed)

        # 75% of rows used in bagging
        if using_feature(
                args, 'bagging') and x_train is not None and args.n_rounds > 1:
            (x_train, y_train), (x_valid,
                                 y_valid) = split_data(x_full,
                                                       y_full,
                                                       n_train=0.75,
                                                       seed=curr_round_seed)

        datasets = {
            k: dict(zip(['x', 'y'], v))
            for k, v in {
                'train': [x_train, y_train],
                'valid': [x_valid, y_valid],
                'test': [x_test, y_test],
                'full': [x_full, y_full],
                'sim': [x_sim, y_sim],
            }.items() if v[0] is not None
        }

        model_kwargs = {
            'n_mix': args.n_mix,
            'hidden': [args.n_hidden] * args.n_layers,
            'lr': args.lr,
            'l2': args.l2,
            'n_iter': args.n_iter,
            'batch': args.batch,
            'imputations': args.imputations,
            'epsilon': args.epsilon,
            'scalerx': TransformerPipeline(
                [S(*args, **kwargs) for S, args, kwargs in args.x_scalers]),
            'scalery': TransformerPipeline(
                [S(*args, **kwargs) for S, args, kwargs in args.y_scalers]),
            'model_path': model_path.joinpath(f'Round_{round_num}'),
            'no_load': args.no_load,
            'no_save': args.no_save,
            'seed': curr_round_seed,
            'verbose': args.verbose,
        }

        model = MDN(**model_kwargs)
        model.fit(x_train,
                  y_train,
                  output_slices,
                  args=args,
                  datasets=datasets)

        if return_model:
            outputs['model'].append(model)

        if return_coefs:
            outputs['scalerx'].append(model.scalerx)
            outputs['scalery'].append(model.scalery)

        if x_test is not None:
            (estimates,
             *confidence), coefs = model.predict(x_test, **predict_kwargs)
            outputs['estimates'].append(estimates)

            if return_coefs:
                outputs['coefs'].append(coefs)

            if len(confidence):
                upper, lower = confidence
                outputs['upper_bound'].append(upper)
                outputs['lower_bound'].append(lower)

            if args.verbose and y_test is not None:
                median = np.median(outputs['estimates'], axis=0)
                labels = get_labels(args.wavelengths,
                                    output_slices,
                                    n_out=y_test.shape[1])
                for lbl, y1, y2 in zip(labels, y_test.T, median.T):
                    print(performance(f'{lbl:>7s} Median', y1, y2))
                print(f'--- Done round {round_num} ---\n')

        if hasattr(model, 'session'): model.session.close()

    # Create compressed model archive
    compress(model_path)

    if len(outputs) == 1:
        outputs = list(outputs.values())[0]
    return outputs, model.output_slices
Example #40
    def summarize_sample_pairs(self):

        from modules.Rage_Plots import rage_subplots

        xLen, yLen = 5, 5
        subplot = rage_subplots.subplot(xLen, yLen, True)
        total_features = len(self.input.features)
        f_num = 1
        LOG = True
        feature_sample_ranks = dd(lambda: dd(float))
        for s in self.input.samples:
            for i, (b, a) in enumerate(
                    sorted([(b, a)
                            for (a, b) in self.input.sample_vals[s].items()])):
                if i == 0: match, rank, m_list = b, 1, [a]
                elif b == match: m_list.append(a)
                else:
                    for m in m_list:
                        feature_sample_ranks[s][m] = rank
                    match, rank, m_list = b, rank + 1, [a]
            for m in m_list:
                feature_sample_ranks[s][m] = rank
        f_num = 1
        fig = matplotlib.pyplot.gcf()
        fig.set_size_inches(18.5, 9.5)
        s_id = ''
        for i in range(len(self.input.samples)):
            for j in range(i + 1, len(self.input.samples)):

                s1, s2 = self.input.samples[i], self.input.samples[j]
                fr1, fr2 = feature_sample_ranks[s1], feature_sample_ranks[s2]
                fkeys = list(set(fr1.keys() + fr2.keys()))
                f_order = [
                    x[1] for x in sorted([(fr1[f] + fr2[f], f) for f in fkeys])
                ]
                x_range = range(len(f_order))
                v1 = [
                    log(1.0 + self.input.sample_vals[s1][f])
                    if f in self.input.sample_vals[s1] else 0 for f in f_order
                ]
                v2 = [
                    log(1.0 + self.input.sample_vals[s2][f])
                    if f in self.input.sample_vals[s2] else 0 for f in f_order
                ]

                vs1 = scale_vals(v1)
                vs2 = scale_vals(v2)
                sv1 = svgf(vs1, 61, 2, mode='nearest')
                sv2 = svgf(vs2, 61, 2, mode='nearest')
                subplot.add_line(x_range, sv1, {'lw': 0.2})
                subplot.add_line(x_range, sv2, {'lw': 0.2})
                sv_mix = [(sv1[z] + sv2[z]) / 2.0 for z in range(len(sv1))]

                step1, step2 = 50, 100
                subplot.add_line(x_range, sv_mix, {'lw': 0.5, 'color': 'k'})
                z_diffs, z_steps = [], []
                for z in range(step2, len(sv_mix), step1):
                    z1 = sv1[z - step2:z + step2]
                    z2 = sv2[z - step2:z + step2]
                    z_diffs.append(
                        sum([(z1[x] - z2[x]) * (z1[x] - z2[x])
                             for x in range(len(z1))]))
                    z_steps.append((z - step2, z + step2))

                    #subplot.add_line(x_range[z-step2:z+step2],sv_mix[z-step2:z+step2],{'color': 'purple','alpha':0.4})
                diff_colors = get_colors(z_diffs, plt.cm.jet)
                for z in range(len(z_steps)):
                    zA, zB = z_steps[z]
                    subplot.add_line(x_range[zA:zB], sv_mix[zA:zB], {
                        'color': diff_colors[z],
                        'alpha': 0.5,
                        'lw': 1
                    })

                #subplot.change_limits({'x1': int(len(x_range)*1.08), 'y0': -0.05,'y1': 0.93})
                subplot.ax.text(int(len(x_range) * 0.03),
                                0.72,
                                s1 + '  ' + s_id + ' ' + s2 + ' ' + s_id,
                                color='red')
                #subplot.ax.plot([0,len(x_range)],[0,0],color='k',linewidth=1,zorder=2)
                if not subplot.update():
                    plt.suptitle('Pair Comparison')
                    plt.subplots_adjust(left=0.04,
                                        bottom=0.01,
                                        right=0.96,
                                        top=0.95,
                                        wspace=0.03,
                                        hspace=0.03)
                    fig.savefig('pairs_out' + str(f_num) + '.png', dpi=100)
                    f_num += 1
                    if f_num > 10: sys.exit()

        sys.exit()
Example #41
def search_omw(lang=None, q=None):

    if lang and q:
        lang_id = lang
        lang_id2 = lang2
        query = q
    else:
        lang_id = request.form['lang']
        lang_id2 = request.form['lang2']
        query = request.form['query']
        query = query.strip()

    sense = dd(list)
    lang_sense = dd(lambda: dd(list))

    # GO FROM FORM TO SENSE
    for s in query_omw(
            """
        SELECT s.id as s_id, ss_id,  wid, fid, lang_id, pos_id, lemma
        FROM (SELECT w_id as wid, form.id as fid, lang_id, pos_id, lemma
              FROM (SELECT id, lang_id, pos_id, lemma
                    FROM f WHERE lemma GLOB ? AND lang_id in (?,?)) as form
              JOIN wf_link ON form.id = wf_link.f_id) word
        JOIN s ON wid=w_id
        """, [
                '[' + query[0].upper() + query[0].lower() + ']' + query[1:],
                lang_id, lang_id2
            ]):

        sense[s['ss_id']] = [
            s['s_id'], s['wid'], s['fid'], s['lang_id'], s['pos_id'],
            s['lemma']
        ]

        lang_sense[s['lang_id']][s['ss_id']] = [
            s['s_id'], s['wid'], s['fid'], s['pos_id'], s['lemma']
        ]

    pos = fetch_pos()
    lang_dct, lang_code = fetch_langs()
    ss, senses, defs, exes, links = fetch_ss_basic(sense.keys())

    labels = fetch_labels(lang_id, set(senses.keys()))

    resp = make_response(
        render_template('omw_results.html',
                        langsel=int(lang_id),
                        langsel2=int(lang_id2),
                        pos=pos,
                        lang_dct=lang_dct,
                        sense=sense,
                        senses=senses,
                        ss=ss,
                        links=links,
                        defs=defs,
                        exes=exes,
                        labels=labels))

    resp.set_cookie('selected_lang', lang_id)
    resp.set_cookie('selected_lang2', lang_id2)
    return resp
Example #42
def makemut(args, hc, avoid, alignopts):

    mutid_list = []
    for site in hc:
        mutid_list.append(site['chrom'] + '_' + str(site['start']) + '_' +
                          str(site['end']) + '_' + str(site['vaf']) + '_' +
                          str(site['altbase']))

    try:
        if args.seed is not None:
            random.seed(int(args.seed) + int(hc[0]['start']))

        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        bammate = pysam.Samfile(
            args.bamFileName, 'rb')  # use for mates to avoid iterator problems
        reffile = pysam.Fastafile(args.refFasta)
        tmpbams = []

        #snvfrac = float(args.snvfrac)

        chrom = None
        vaf = None

        mutpos_list = []
        altbase_list = []

        for site in hc:
            if chrom is None:
                chrom = site['chrom']
            else:
                assert chrom == site[
                    'chrom'], "haplotype clusters cannot span multiple chromosomes!"

            if vaf is None:
                vaf = site['vaf']

            elif vaf != site['vaf']:
                logger.warning(
                    "multiple VAFs for single haplotype, using first encountered VAF: %f"
                    % vaf)

            mutpos = int(random.uniform(site['start'], site['end'] +
                                        1))  # position of mutation in genome
            mutpos_list.append(mutpos)
            altbase_list.append(site['altbase'])

        mutbase_list = []
        refbase_list = []
        mutstr_list = []

        for n, mutpos in enumerate(mutpos_list):
            refbase = reffile.fetch(chrom, mutpos - 1, mutpos)
            altbase = altbase_list[n]
            refbase_list.append(refbase)

            if altbase == refbase.upper() and not args.ignoreref:
                logger.warning(
                    "%s specified ALT base matches reference, skipping mutation"
                    % mutid_list[n])
                return None

            try:
                mutbase = mut(refbase, altbase)
                mutbase_list.append(mutbase)

            except ValueError as e:
                logger.warning(mutid_list[n] + " " + ' '.join(
                    ("skipped site:", chrom, str(hc[n]['start']),
                     str(hc[n]['end']), "due to N base:", str(e), "\n")))
                return None

            mutstr_list.append(refbase + "-->" + str(mutbase))

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        hapstr = "_".join(
            ('haplo', chrom, str(min(mutpos_list)), str(max(mutpos_list))))
        log = open(
            'addsnv_logs_' + os.path.basename(args.outBamFile) + '/' +
            os.path.basename(args.outBamFile) + "." + hapstr + ".log", 'w')

        tmpoutbamname = args.tmpdir + "/" + hapstr + ".tmpbam." + str(
            uuid4()) + ".bam"
        logger.info("%s creating tmp bam: %s" % (hapstr, tmpoutbamname))
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
            args,
            log,
            bamfile,
            bammate,
            chrom,
            min(mutpos_list),
            max(mutpos_list) + 1,
            mutpos_list,
            avoid=avoid,
            mutid_list=mutid_list,
            is_snv=True,
            mutbase_list=mutbase_list,
            reffile=reffile)

        if mutfail:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        # pick reads to change
        readlist = []
        for extqname, read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                readlist.append(extqname)

        logger.info("%s len(readlist): %s" % (hapstr, str(len(readlist))))
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            logger.warning("%s too few reads in region (%s) skipping..." %
                           (hapstr, str(len(readlist))))
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        if vaf is None:
            vaf = float(
                args.mutfrac
            )  # default minor allele freq if not otherwise specified
        if cnv:  # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, min(mutpos_list),
                                          max(mutpos_list) + 1):
                    cn = float(cnregion.strip().split()
                               [3])  # expect chrom,start,end,CN
                    logger.info(hapstr + "\t" +
                                ' '.join(("copy number in snp region:", chrom,
                                          str(min(mutpos_list)),
                                          str(max(mutpos_list)), "=",
                                          str(cn))))
                    if float(cn) > 0.0:
                        vaf = 1.0 / float(cn)
                    else:
                        vaf = 0.0
                    logger.info("%s adjusted VAF: %f" % (hapstr, vaf))
        else:
            logger.info("%s selected VAF: %f" % (hapstr, vaf))

        lastread = int(len(readlist) * vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                logger.warning("%s forced %d reads." % (hapstr, lastread))
            else:
                logger.warning(
                    "%s dropped site with fewer reads than --minmutreads" %
                    hapstr)
                os.remove(tmpoutbamname)
                return None

        readtrack = dd(list)

        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1

            if usedreads >= lastread:
                break

        readlist = newreadlist

        logger.info("%s picked: %d" % (hapstr, len(readlist)))

        wrote = 0
        nmut = 0
        mut_out = {}
        # change reads from .bam to mutated sequences
        for extqname, read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in readlist:
                    qual = read.qual  # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname]  # make mutation
                    read.qual = qual
                    nmut += 1
            if (not hasSNP) or args.force:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F'  # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S'  # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U'  # read is unpaired

                    mateqname = ','.join(
                        (mate_read.qname, str(mate_read.pos), pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True

                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        logger.info("%s wrote: %d, mutated: %d" % (hapstr, wrote, nmut))

        if not hasSNP or args.force:
            outbam_muts.close()

            aligners.remap_bam(args.aligner,
                               tmpoutbamname,
                               args.refFasta,
                               alignopts,
                               mutid=hapstr,
                               paired=(not args.single),
                               picardjar=args.picardjar,
                               insane=args.insane)

            outbam_muts = pysam.Samfile(tmpoutbamname, 'rb')
            coverwindow = 1
            incover = countReadCoverage(bamfile, chrom,
                                        min(mutpos_list) - coverwindow,
                                        max(mutpos_list) + coverwindow)
            outcover = countReadCoverage(outbam_muts, chrom,
                                         min(mutpos_list) - coverwindow,
                                         max(mutpos_list) + coverwindow)

            avgincover = float(sum(incover)) / float(len(incover))
            avgoutcover = float(sum(outcover)) / float(len(outcover))

            logger.info("%s avgincover: %f, avgoutcover: %f" %
                        (hapstr, avgincover, avgoutcover))

            spikein_snvfrac = 0.0
            if wrote > 0:
                spikein_snvfrac = float(nmut) / float(wrote)

            # qc cutoff for final snv depth
            if (avgoutcover > 0 and avgincover > 0 and avgoutcover / avgincover
                    >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                for n, site in enumerate(hc):
                    snvstr = chrom + ":" + str(site['start']) + "-" + str(
                        site['end']) + " (VAF=" + str(vaf) + ")"
                    log.write("\t".join(("snv", snvstr,
                                         str(mutpos_list[n]), mutstr_list[n],
                                         str(avgincover), str(avgoutcover),
                                         str(spikein_snvfrac), str(maxfrac))) +
                              "\n")
            else:

                outbam_muts.close()
                os.remove(tmpoutbamname)
                if os.path.exists(tmpoutbamname + '.bai'):
                    os.remove(tmpoutbamname + '.bai')
                logger.warning("%s dropped for outcover/incover < %s" %
                               (hapstr, str(args.coverdiff)))
                return None

        outbam_muts.close()
        bamfile.close()
        bammate.close()
        log.close()

        return tmpbams

    except Exception, e:
        sys.stderr.write("*" * 60 + "\nERROR\t" + now() +
                         "\tencountered error in mutation spikein: " +
                         str(mutid_list) + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*" * 60 + "\n")
        if os.path.exists(tmpoutbamname):
            os.remove(tmpoutbamname)
        if os.path.exists(tmpoutbamname + '.bai'):
            os.remove(tmpoutbamname + '.bai')
        return None
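# The block above converts the (possibly copy-number adjusted) VAF into a number of reads to
# mutate, enforcing --minmutreads when the region has enough reads. A minimal standalone sketch
# of that selection rule (hypothetical helper name; minmutreads passed in as an int):
def pick_mut_read_count(n_reads, vaf, minmutreads):
    lastread = int(n_reads * vaf)
    if lastread < minmutreads:
        if n_reads > minmutreads:
            return minmutreads  # force the floor when the region has enough reads
        return None             # too few reads at the site: drop it
    return lastread

assert pick_mut_read_count(100, 0.25, 3) == 25
assert pick_mut_read_count(100, 0.01, 3) == 3
assert pick_mut_read_count(2, 0.25, 3) is None  # site would be dropped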
Ejemplo n.º 43
0
def najdlcesarskie(dlugosci, slowa):
    for i in range(len(dlugosci)):
        L = set()
        for s in slowa[dlugosci[i]]:
            for k in range(1, 32):
                slowo2 = ceasar(s, k)

                if (slowo2 in slowa[dlugosci[i]]):
                    L.add(s)
        if (L != set()):
            return L
    return None


plik = open("popularne.txt")

slowa = dd(set)
dl = set()

for w in plik:
    w = w.strip()
    if (zle_slowo(w) == False):
        continue
    else:
        slowa[len(w)].add(w)
        dl.add(len(w))
dlugosci = list(dl)
dlugosci.sort(reverse=True)

print(najdlcesarskie(dlugosci, slowa))
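# The snippet above relies on two helpers that are not shown: ceasar(s, k), a Caesar shift of
# word s by k positions, and zle_slowo(w), a word filter whose exact convention is not visible
# here. A minimal sketch of the shift helper, assuming a plain lowercase alphabet (range(1, 32)
# in the loop suggests the original worked over the 32-letter Polish alphabet):
import string

ALFABET = string.ascii_lowercase  # assumption; swap in the Polish alphabet to match the original

def ceasar(s, k):
    n = len(ALFABET)
    return ''.join(ALFABET[(ALFABET.index(c) + k) % n] for c in s)

assert ceasar("abc", 2) == "cde"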
Ejemplo n.º 44
0
def makemut(args, chrom, start, end, vaf, ins, avoid, alignopts):
    ''' if ins is a sequence, it is inserted at start; otherwise the region from start to end is deleted '''

    if args.seed is not None: random.seed(int(args.seed) + int(start))

    mutid = chrom + '_' + str(start) + '_' + str(end) + '_' + str(vaf)
    if ins is None:
        mutid += ':DEL'
    else:
        mutid += ':INS:' + ins

    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        bammate = pysam.Samfile(args.bamFileName, 'rb') # use for mates to avoid iterator problems
        reffile = pysam.Fastafile(args.refFasta)
        tmpbams = []

        is_insertion = ins is not None
        is_deletion  = ins is None

        snvfrac = float(args.snvfrac)

        mutstr = get_mutstr(chrom, start, end, ins, reffile)

        del_ln = 0
        if is_deletion:
            del_ln = end-start

        mutpos = start
        mutpos_list = [start]

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        log = open('addindel_logs_' + os.path.basename(args.outBamFile) + '/' + os.path.basename(args.outBamFile) + "." + "_".join((chrom,str(start),str(end))) + ".log",'w')

        tmpoutbamname = args.tmpdir + "/" + mutid + ".tmpbam." + str(uuid4()) + ".bam"
        logger.info("%s creating tmp bam: %s" % (mutid ,tmpoutbamname))
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(args, log, bamfile, bammate, chrom, mutpos, mutpos+del_ln+1, mutpos_list, avoid=avoid, mutid_list=[mutid], is_insertion=is_insertion, is_deletion=is_deletion, ins_seq=ins, reffile=reffile, indel_start=start, indel_end=end)

        if mutfail:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        # pick reads to change
        readlist = []
        for extqname,read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                readlist.append(extqname)

        logger.info("%s len(readlist): %d" % (mutid, len(readlist)))
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            logger.warning("%s skipped, too few reads in region: %d" % (mutid, str(len(readlist))))
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        if vaf is None:
            vaf = float(args.mutfrac) # default minor allele freq if not otherwise specified
        if cnv: # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom,start,end):
                    cn = float(cnregion.strip().split()[3]) # expect chrom,start,end,CN
                    logger.info(mutid + "\t" + ' '.join(("copy number in snp region:",chrom,str(start),str(end),"=",str(cn))))
                    if float(cn) > 0.0:
                        vaf = 1.0/float(cn)
                    else:
                        vaf = 0.0
                    logger.info("%s adjusted VAF: %f" % (mutid, vaf))
        else:
            logger.info("%s selected VAF: %f" % (mutid, vaf))

        lastread = int(len(readlist)*vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                logger.warning("%s forced %d reads" % (mutid, lastread))
            else:
                logger.warning("%s dropped site with fewer reads than --minmutreads" % mutid)
                os.remove(tmpoutbamname)
                return None

        readtrack = dd(list)

        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1

            if usedreads >= lastread:
                break

        readlist = newreadlist

        logger.info("%s picked: %d reads" % (mutid, len(readlist)))

        wrote = 0
        nmut = 0
        mut_out = {}
        # change reads from .bam to mutated sequences
        for extqname,read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in readlist:
                    qual = read.qual # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname] # make mutation
                    read.qual = qual
                    nmut += 1
            if not hasSNP or args.force:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F' # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S' # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U' # read is unpaired

                    mateqname = ','.join((mate_read.qname,str(mate_read.pos),pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True

                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        logger.info("%s wrote: %d, mutated: %d" % (mutid,wrote,nmut))

        if not hasSNP or args.force:
            outbam_muts.close()
            aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta, alignopts, mutid=mutid, paired=(not args.single), picardjar=args.picardjar, insane=args.insane)

            outbam_muts = pysam.Samfile(tmpoutbamname,'rb')
            coverwindow = 1
            incover  = countReadCoverage(bamfile,chrom,mutpos-coverwindow,mutpos+del_ln+coverwindow)
            outcover = countReadCoverage(outbam_muts,chrom,mutpos-coverwindow,mutpos+del_ln+coverwindow)

            avgincover  = float(sum(incover))/float(len(incover)) 
            avgoutcover = float(sum(outcover))/float(len(outcover))
            spikein_frac = 0.0
            if wrote > 0:
                spikein_frac = float(nmut)/float(wrote)

            # qc cutoff for final snv depth 
            if (avgoutcover > 0 and avgincover > 0 and avgoutcover/avgincover >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                indelstr = ''
                if is_insertion:
                    indelstr = ':'.join(('INS', chrom, str(start), ins))
                else:
                    indelstr = ':'.join(('DEL', chrom, str(start), str(end)))

                snvstr = chrom + ":" + str(start) + "-" + str(end) + " (VAF=" + str(vaf) + ")"
                log.write("\t".join(("indel",indelstr,str(mutpos),mutstr,str(avgincover),str(avgoutcover),str(spikein_frac),str(maxfrac)))+"\n")
            else:
                outbam_muts.close()
                os.remove(tmpoutbamname)
                if os.path.exists(tmpoutbamname + '.bai'):
                    os.remove(tmpoutbamname + '.bai')
                    
                logger.warning("%s dropped for outcover/incover < %s" % (mutid, str(args.coverdiff)))
                return None

        outbam_muts.close()
        bamfile.close()
        bammate.close()
        log.close() 

        return sorted(tmpbams)
        
    except Exception, e:
        sys.stderr.write("*"*60 + "\nencountered error in mutation spikein: " + mutid + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*"*60 + "\n")
        if os.path.exists(tmpoutbamname):
            os.remove(tmpoutbamname)
        if os.path.exists(tmpoutbamname + '.bai'):
            os.remove(tmpoutbamname + '.bai')
        return None
def record(key, key2, d):
    if key in d:
        d[key][key2] += 1
    else:
        d[key] = dd(int)
        d[key][key2] = 1
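# A quick toy illustration of the nested-counter pattern record() builds: d maps each outer key
# to a defaultdict(int) of inner-key counts (the keys below are hypothetical):
counts = {}
record("walk", "VB", counts)
record("walk", "VB", counts)
record("walk", "NN", counts)
assert counts["walk"]["VB"] == 2
assert counts["walk"]["NN"] == 1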
Ejemplo n.º 46
0
def comp_term_freq(entities):
    term_freq = dd(int)
    for ent in entities:
        for token in jieba.lcut(ent):
            term_freq[token] += 1
    return term_freq
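# A short usage sketch of comp_term_freq (hypothetical entity strings; jieba must be importable,
# as in the function above):
entities = [u"心肌梗塞", u"急性心肌梗塞"]
freq = comp_term_freq(entities)
top = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:10]
# 'top' now holds the most frequent sub-tokens across all entity strings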
Ejemplo n.º 47
0
        for file in file_list:
            try:
                comment_list = pickle.load(open(file, "rb"))
                for j in comment_list:
                    yield (trim(j["raw_message"]).split())
            except Exception:
                # skip comment files that fail to load or parse
                continue


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
comment_pattern = re.compile("^comments_(1000_)?2016.*")
prev_file = None
for site in site_list:
    site_d = site + "/"
    files = dd(list)
    for i in os.listdir(direc_prefix + site_d):
        if comment_pattern.match(i):
            files[find_month(int(
                day_finder.search(i).group(1)))].append(direc_prefix + site_d +
                                                        i)
    for month in months:
        file_list = files[month]
        s = sentences(file_list)
        if prev_file is None:
            model = word2vec.Word2Vec(s, iter=10)
        else:
            model = gensim.models.Word2Vec.load(prev_file)
            model.train(s, iter=10, size=300, workers=10)
            prev_file = result_storage_direc + "{}2016{}.w2v".format(
                site, month)
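# In the loop above, the first month's model is never saved and prev_file is only assigned in the
# else branch, so the "load last month and keep training" path can never run. A minimal sketch of
# one way the incremental loop is commonly written, assuming gensim >= 4 keyword names
# (vector_size/epochs) rather than the older API the original appears to target; months, files,
# sentences, site and result_storage_direc are reused from the snippet above:
from gensim.models import Word2Vec

model = None
for month in months:
    corpus = list(sentences(files[month]))  # materialize so gensim can iterate it twice
    if model is None:
        model = Word2Vec(corpus, vector_size=300, workers=10, epochs=10)
    else:
        model.build_vocab(corpus, update=True)  # extend the vocabulary with the new month
        model.train(corpus, total_examples=len(corpus), epochs=model.epochs)
    model.save(result_storage_direc + "{}2016{}.w2v".format(site, month))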
Ejemplo n.º 48
0
def dfs(source, par=-1):  # signature assumed from the calls dfs(0) and dfs(child, source)
    for child in graph[source]:
        if child != par:
            parent[child] = source
            dfs(child, source)


def precompute():
    dfs(0)  # Considering 0 as the root of the tree.
    for i in range(lg + 1):
        for j in range(n):
            if i == 0:
                table[i][j] = parent[j]
                continue
            table[i][j] = table[i - 1][table[i - 1][j]]


n, m = map(int, input().split())  # The number of nodes and number of edges
graph = dd(set)
for i in range(m):
    u, v = map(int, input().split())
    graph[u].add(v)
    graph[v].add(u)
lg = int(log(n, 2))
# Precompute, for every node, its ancestors at distances that are powers of two.
parent = dd(lambda: -1)
table = dd(lambda: dd(lambda: -1))
precompute()
node, ancestor = map(
    int,
    input().split())  # Enter the node and i-th ancestor we want to find
for i in range(lg + 1):
    if ancestor & 1:
        node = table[i][node]
    ancestor >>= 1  # consume this bit before testing the next power-of-two jump
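# A small self-contained check of the same jump-table idea on a toy path 0-1-2-3-4-5-6 rooted
# at 0 (dd and log as imported above); the 5th ancestor of node 6 should be node 1:
def kth_ancestor(node, k, table, lg):
    for i in range(lg + 1):
        if k & 1:
            node = table[i][node]
        k >>= 1  # move to the next power-of-two jump
    return node

_n = 7
_lg = int(log(_n, 2))
_parent = {0: 0}
_parent.update({j: j - 1 for j in range(1, _n)})
_table = dd(lambda: dd(lambda: -1))
for _i in range(_lg + 1):
    for _j in range(_n):
        _table[_i][_j] = _parent[_j] if _i == 0 else _table[_i - 1][_table[_i - 1][_j]]
assert kth_ancestor(6, 5, _table, _lg) == 1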
Ejemplo n.º 49
0
    def make_dictionary(self, question_dir, vocab_file, freq_file):

        if os.path.exists(vocab_file) and os.path.exists(freq_file):
            print "loading vocabularies from " + vocab_file + " ..."
            vocabularies = map(lambda x: x.strip(),
                               open(vocab_file).readlines())
            word2freq = cPickle.load(open(freq_file))
        else:
            print "no " + vocab_file + " found, constructing the vocabulary list ..."

            fnames = []
            fnames += glob.glob(question_dir + "/test/*.question")
            fnames += glob.glob(question_dir + "/validation/*.question")
            fnames += glob.glob(question_dir + "/training/*.question")

            vocab_set = set()
            n = 0.
            word2freq = dd(int)
            for fname in fnames:

                fp = open(fname)
                fp.readline()
                fp.readline()
                document = fp.readline().split()
                fp.readline()
                query = fp.readline().split()
                fp.close()

                vocab_set |= set(document) | set(query)

                for word in document:
                    word2freq[word] += 1
                for word in query:
                    word2freq[word] += 1
                word2freq[SYMB_BEGIN] += 2
                word2freq[SYMB_END] += 2

                # show progress
                n += 1
                if n % 10000 == 0:
                    print '%3d%%' % int(100 * n / len(fnames))

            entities = set(e for e in vocab_set if e.startswith('@entity'))

            # @placehoder, @begin and @end are included in the vocabulary list
            tokens = vocab_set.difference(entities)
            tokens.add(SYMB_BEGIN)
            tokens.add(SYMB_END)

            vocabularies = list(entities) + list(tokens)

            print "writing vocabularies to " + vocab_file + " ..."
            vocab_fp = open(vocab_file, "w")
            vocab_fp.write('\n'.join(vocabularies))
            vocab_fp.close()

            freqs = [v for k, v in word2freq.iteritems()]
            freqs.sort()
            freq2index = {}
            bin_size = len(freqs) / BIN_NUM + 1
            for i, start in enumerate(range(0, len(freqs), bin_size)):
                end = min(start + bin_size, len(freqs))
                for j in range(start, end):
                    freq2index[freqs[j]] = i
            for k in word2freq.keys():
                word2freq[k] = freq2index[word2freq[k]]
            cPickle.dump(word2freq, open(freq_file, 'w'),
                         cPickle.HIGHEST_PROTOCOL)

        vocab_size = len(vocabularies)
        word_dictionary = dict(zip(vocabularies, range(vocab_size)))
        char_set = set([c for w in vocabularies for c in list(w)])
        char_set.add(' ')
        char_dictionary = dict(zip(list(char_set), range(len(char_set))))
        num_entities = len(
            [v for v in vocabularies if v.startswith('@entity')])
        print "vocab_size = %d" % vocab_size
        print "num characters = %d" % len(char_set)
        print "%d anonymoused entities" % num_entities
        print "%d other tokens (including @placeholder, %s and %s)" % (
            vocab_size - num_entities, SYMB_BEGIN, SYMB_END)

        return word_dictionary, char_dictionary, num_entities, word2freq
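# The frequency-binning block above maps each raw count to the index of its (roughly equal-sized)
# bin over the sorted counts. The same mapping as a compact standalone helper (written in
# Python 3 style for brevity; BIN_NUM is assumed to be a small integer, as in the original):
def bin_frequencies(word2freq, bin_num):
    """Map each word's raw count to the index of its quantile-style frequency bin."""
    freqs = sorted(word2freq.values())
    bin_size = len(freqs) // bin_num + 1
    freq2index = {}
    for i, start in enumerate(range(0, len(freqs), bin_size)):
        for f in freqs[start:start + bin_size]:
            freq2index[f] = i
    return {w: freq2index[f] for w, f in word2freq.items()}

# e.g. five counts in two bins: the three smallest counts land in bin 0, the rest in bin 1
assert bin_frequencies({'a': 1, 'b': 2, 'c': 3, 'd': 5, 'e': 8}, 2) == \
    {'a': 0, 'b': 0, 'c': 0, 'd': 1, 'e': 1}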
def count_tagsets(f,
                  delimiter="\t",
                  gold_analysis_in_the_first_position=False,
                  verbose=False):
    tagsets_dict = dd(int)
    root_and_analysis_cooccurence = {}
    surface_form_and_gold_analysis_cooccurence = {}

    ambiguity_scores = []

    def record(key, key2, d):
        if key in d:
            d[key][key2] += 1
        else:
            d[key] = dd(int)
            d[key][key2] = 1

    def record_root_and_analysis_cooccurence(root, analysis):
        record(root, analysis, root_and_analysis_cooccurence)

    def record_surface_form_and_gold_analysis_cooccurence(
            surface_form, analysis):
        record(surface_form, analysis,
               surface_form_and_gold_analysis_cooccurence)

    current_tagset = []
    current_roots = []
    analyses_idx = 0

    sentence_length = 0

    line = f.readline()
    # print line
    while line:
        line = line.strip()
        tokens = line.split(delimiter)
        # print tokens
        if len(tokens) == 3:
            if gold_analysis_in_the_first_position and analyses_idx == 0:
                record_surface_form_and_gold_analysis_cooccurence(
                    tokens[0], tokens[2])
            if analyses_idx == 0 and verbose:
                print("SURFACE FORM: %s" % tokens[0])
            current_tagset += [tokens[2]]
            current_roots += [tokens[1]]
            record_root_and_analysis_cooccurence(tokens[1], tokens[2])
            analyses_idx += 1

            if tokens[0] in ["<S>", "<DOC>", "<TITLE>", "</DOC>", "</TITLE>"]:
                sentence_length = 0
                current_product_of_ambiguities = 1
            elif tokens[0] == "</S>":
                ambiguity_score = current_product_of_ambiguities / float(
                    sentence_length) if sentence_length != 0 else 0.0
                ambiguity_scores.append([ambiguity_score, sentence_length])

        elif len(tokens) == 1:
            # tagset ended
            if len(current_tagset) > 0:
                tree_root = TreeNode(None, "ROOT")
                root_to_anonymized_root = {
                    root: ("X%d" % (idx + 1))
                    for idx, root in enumerate(sorted(set(current_roots)))
                }
                sorted_tagset = sorted(zip(
                    [root_to_anonymized_root[root] for root in current_roots],
                    current_tagset),
                                       key=lambda x: x[1])
                tagsets_dict["\n".join([x + y for x, y in sorted_tagset])] += 1

                current_product_of_ambiguities *= len(current_tagset)

                # trees
                for tagset_as_seq in [(x + y).split("+")
                                      for x, y in sorted_tagset]:
                    insert_into_tree(tree_root, tagset_as_seq)

                if verbose:
                    unanonymized_sorted_tagset = sorted(zip(
                        current_roots, current_tagset),
                                                        key=lambda x: x[1])
                    print(unanonymized_sorted_tagset)
                    print(sorted_tagset)
                    tr = LeftAligned()
                    print(tr(tree_root.print_children_recursive()))

                # clear
                current_tagset = []
                current_roots = []
                analyses_idx = 0

                sentence_length += 1

        elif len(tokens) == 2:
            # <DOC> or <TITLE> OR <S> OR </S>
            pass
        line = f.readline()
    return tagsets_dict, root_and_analysis_cooccurence, surface_form_and_gold_analysis_cooccurence, ambiguity_scores
Ejemplo n.º 51
0
    def __init__(self, options, data):
        self.options = options

        self.names = [f.name for f in data.features]
        self.sample_names = [s.name for s in data.samples]
        self.marker_stats = dd(lambda: dd(bool))
Ejemplo n.º 52
0
class LabelToSeed:

    seeds = dd(list)
    tweets = []
    labels = []

    def find_emo_cause(self, tweet_file, label_file):
        """
        Find the emotion causes
        :param tweet_file: tokenized tweet file
        :param label_file: labeled tweet file
        :return: void
        """
        with open(tweet_file, 'r') as t:
            for words in t:
                words = words.split()
                words = [w.split(":")[0] for w in words]
                self.tweets.append(words)
        with open(label_file, 'r') as l:
            for tags in l:
                tags = tags.split()
                self.labels.append(tags)
        for idx in range(len(self.labels)):
            self.extract_emo_cause(self.labels[idx], self.tweets[idx])

        self.pickle_seeds()

    def extract_emo_cause(self, labels, words):
        """
        Extract the emotions and causes
        :param labels: list of labels
        :param words: list of words
        :return: void
        """
        emo_flag = False
        cause_flag = False
        emo = []
        cause = []

        if len(labels) != len(words):
            raise Warning("Tweet tokens and labels do not match: ", Warning)
        else:
            for idx, label in enumerate(labels):

                if label == "I-E":
                    emo.append(words[idx])

                elif label == "I-C":
                    cause.append(words[idx])

                elif label == "O":
                    if emo_flag:
                        emo_flag = False
                        if cause:
                            self.seeds[" ".join(emo)].append(" ".join(cause))
                            emo = []
                            cause = []
                    elif cause_flag:
                        cause_flag = False
                        if emo:
                            self.seeds[" ".join(emo)].append(" ".join(cause))
                            emo = []
                            cause = []

                elif label == "B-E":
                    if cause_flag:
                        cause_flag = False
                        if emo:
                            self.seeds[" ".join(emo)].append(" ".join(cause))
                            emo = []
                            cause = []
                    emo.append(words[idx])
                    emo_flag = True

                elif label == "B-C":
                    if emo_flag:
                        emo_flag = False
                        if cause:
                            self.seeds[" ".join(emo)].append(" ".join(cause))
                            emo = []
                            cause = []
                    cause.append(words[idx])
                    cause_flag = True

                else:
                    raise Warning("Unknown label encountered: " + label,
                                  Warning)

            # When BIO tags occur at end of sentence
            if emo_flag or cause_flag:
                if emo and cause:
                    self.seeds[" ".join(emo)].append(" ".join(cause))

    def pickle_seeds(self):
        """
        Pickle the seed data
        :return: void
        """
        pickle.dump(self.seeds, open('../../lib/seeds/train_seeds.pkl', 'wb'))
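# A toy illustration of the BIO convention the extractor above expects: B-E/I-E mark an emotion
# span, B-C/I-C a cause span, and O closes whichever span is open (the tweet is hypothetical):
lts = LabelToSeed()
lts.extract_emo_cause(["B-E", "I-E", "O", "B-C", "I-C", "O"],
                      ["so", "happy", "about", "the", "concert", "tonight"])
assert dict(lts.seeds) == {"so happy": ["the concert"]}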
Ejemplo n.º 53
0
    def logitR(self, target, target_names, Y, F, left_out=5):

        logit = LogisticRegression(penalty='l1')
        id_Ys = [[y[i] for y in Y] for i in range(len(target))]
        coef_key = dd(lambda: dd(list))
        out = rage_classify_outputs.Classifier_Output(self.options)

        TRAIN_IDXS, CAND_IDXS, TEST_IDXS = [], [], []
        s_key, p_key = {}, dd(list)
        sample_grades, gene_grades = {}, dd(lambda: dd(list))

        for i in range(len(target)):
            sample_grades[self.sample_names[i]] = dd(list)
            if type(target[i]) == int:
                s_key[i] = [
                    self.sample_names[i], True, target_names[target[i]]
                ]
                TRAIN_IDXS.append(i)
                CAND_IDXS.append(i)
            else:
                s_key[i] = [self.sample_names[i], False, target[i]]
                TEST_IDXS.append(i)

        while len(CAND_IDXS) > 0:
            if len(CAND_IDXS) > left_out:
                test_set = list([
                    x for x in np.random.choice(
                        CAND_IDXS, left_out, replace=False)
                ])
            else:
                test_set = CAND_IDXS
            train_set = [i for i in TRAIN_IDXS if i not in test_set]
            train_opts = [target_names[target[t]] for t in train_set]

            if len(list(set(train_opts))) != len(target_names):
                print 'not every class is represented in the training split; shrinking the held-out set'
                left_out -= 1
                continue

            Yj, Tj = [id_Ys[i]
                      for i in train_set], [target[i] for i in train_set]

            logit.fit(Yj, Tj)
            p_coeffs = logit.coef_
            preds = logit.predict(id_Ys)
            probs = logit.predict_proba(id_Ys)

            for i in range(len(p_coeffs)):
                for k in range(len(p_coeffs[i])):
                    coef_key[k][i].append(p_coeffs[i][k])

            for j in TEST_IDXS + test_set:
                pr = sorted(probs[j], reverse=True)
                p_key[j].append((target_names[preds[j]], pr[0] - pr[1]))
                sName, sBool, sTrue = s_key[j]
                sPredict = target_names[preds[j]]
                if len(p_key[j]) > 0:

                    for k, F_name in enumerate(F):
                        jVal = Y[k][j]

                        jMults = sorted({
                            Xk: jVal * Xv[-1]
                            for Xk, Xv in coef_key[k].items()
                        }.items(),
                                        key=lambda XX: XX[1])
                        for cIdx, cVals in enumerate(p_coeffs):
                            if cVals[k] != 0:
                                cMult = cVals[k] * jVal

                        if jMults[-1][1] > 0:
                            mTarget = target_names[jMults[-1][0]]
                            sample_grades[sName][mTarget].append(
                                [F_name, -1 * jMults[-1][1]])
                        if jMults[0][1] < 0:
                            mTarget = target_names[jMults[0][0]]
                            sample_grades[sName][mTarget].append(
                                [F_name, -1 * jMults[-1][1]])

                    sOutcome = (sTrue, sPredict)
                    if sTrue == sPredict: sBoolOutcome = 'YES'
                    else: sBoolOutcome = 'NO'

                    for t in target_names:

                        tPos = sorted(
                            [(f_val, f_name)
                             for f_name, f_val in sample_grades[sName][t]
                             if f_val > 0],
                            reverse=True)
                        tNeg = sorted(
                            [(f_val, f_name)
                             for f_name, f_val in sample_grades[sName][t]
                             if f_val < 0],
                            reverse=False)

                        if sTrue != t:
                            for fi, f_data in enumerate(tPos):
                                gene_grades[f_data[-1]]['FALSE_POS'].append(
                                    (fi + 1, sBoolOutcome, sOutcome))
                            for fi, f_data in enumerate(tNeg):
                                gene_grades[f_data[-1]]['TRUE_NEG'].append(
                                    (fi + 1, sBoolOutcome, sOutcome))
                        if sTrue == t:
                            for fi, f_data in enumerate(tPos):
                                gene_grades[f_data[-1]]['TRUE_POS'].append(
                                    (fi + 1, sBoolOutcome, sOutcome))
                            for fi, f_data in enumerate(tNeg):
                                gene_grades[f_data[-1]]['FALSE_NEG'].append(
                                    (fi + 1, sBoolOutcome, sOutcome))

                    out.add_score(s_key[j], p_key[j])
                    if j in CAND_IDXS: CAND_IDXS.remove(j)
                    elif j in TEST_IDXS: TEST_IDXS.remove(j)

        for g in gene_grades:
            GK = {}
            for k in gene_grades[g].keys():
                k_data = gene_grades[g][k]
                rank_all = [kd[0] for kd in k_data]
                rank_yes = [kd[0] for kd in k_data if kd[1] == 'YES']
                rank_no = [kd[0] for kd in k_data if kd[1] == 'NO']

                ram, raL = np.mean(rank_all), len(rank_all)
                rym, ryL = np.mean(rank_yes), len(rank_yes)
                rnm, rnL = np.mean(rank_no), len(rank_no)

                GK[k] = [ram, raL, rym, ryL, rnm, rnL]

            out.add_gene_grades(g, GK)

        for k, f_name in enumerate(F):
            out.add_coefs(f_name, coef_key[k], target_names)
Ejemplo n.º 54
0
import sys
from collections import defaultdict as dd
sys.setrecursionlimit(1000000)

d=dd(list)
mod=1000000007

n=int(input())
for _ in range(n-1):
  a,b=map(int,input().split())
  d[a].append(b)
  d[b].append(a)

d=dict(d)
f=[None]*(n+1)
g=[None]*(n+1)

def dfs(p,x):
  for i in d[x]:
    if i!=p:
      dfs(x,i)
  f[x],g[x]=1,1
  
  for i in d[x]:
    if i!=p:
      g[x]*=f[i]
      g[x]%=mod
      f[x]*=g[i]
      f[x]%=mod
  f[x]+=g[x]
  f[x]%=mod
Ejemplo n.º 55
0
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

 
plik=open("wyniki_wyborow.tsv")

dane=list()  # city - number of seats - results

for w in plik:
	w=list(w.split("\t"))
	dane.append(w[1:-1])
	
komitety=dane[0][2:]  # party (committee) names
dane=dane[1:]
wyniki=dd(list)  # (city, alfa) -> list of [result, party]

for lista in dane:
	miasto=lista[0]
	for dzielnik in range(1,int(lista[1])):
		alfa=0.1
		while(alfa<2):
			x=dzielnik**alfa
			for i in range(len(lista[2:])):
				partia=komitety[i]
				procent=lista[i+2]
				if(',' in procent):
					procent=procent.replace(',','.')
					procent=float(procent)
					wyniki[(miasto,alfa)].append([procent/x,partia])
					continue
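# The loop above builds divisor-method quotients (result divided by dzielnik**alfa; alfa == 1
# gives the classic d'Hondt rule). For reference, a minimal self-contained d'Hondt allocation
# over hypothetical vote shares:
from collections import defaultdict as dd

def dhondt(votes, seats, alfa=1.0):
    """Repeatedly award a seat to the party with the largest votes / (won + 1)**alfa quotient."""
    won = dd(int)
    for _ in range(seats):
        best = max(votes, key=lambda p: votes[p] / (won[p] + 1) ** alfa)
        won[best] += 1
    return dict(won)

assert dhondt({"A": 48.0, "B": 25.0, "C": 15.0}, 5) == {"A": 3, "B": 1, "C": 1}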
Ejemplo n.º 56
0
    def logitU(self, target, target_names, Y, F, left_out=5):

        logit = LogisticRegression(penalty='l1')

        iterations = 5
        known_key, unk_key = {}, {}
        known_idxs, known_vals = [], []
        unk_idxs, unk_vals = [], []
        valid_names, valid_target, valid_key = [], [], {}
        for i in range(len(target)):
            if target_names[target[i]].split('~')[-1].upper()[0:3] == 'UNK':
                unk_idxs.append(i)
                unk_vals.append([y[i] for y in Y])
                unk_key[len(unk_idxs) -
                        1] = [self.sample_names[i], target_names[target[i]]]
            else:
                known_idxs.append(i)
                known_vals.append([y[i] for y in Y])
                known_key[len(known_idxs) -
                          1] = [self.sample_names[i], target_names[target[i]]]
                if target[i] not in valid_key:
                    valid_names.append(target_names[target[i]])
                    valid_key[target[i]] = len(valid_names) - 1
                valid_target.append(valid_key[target[i]])

        novel_key = dd(list)
        iter_key = dd(list)

        left_idxs = np.random.choice(range(len(valid_target)),
                                     left_out,
                                     replace=False)
        while True:
            left_vals, left_target = [known_vals[i] for i in left_idxs
                                      ], [valid_target[i] for i in left_idxs]

            iter_vals, iter_target = [
                v for i, v in enumerate(known_vals) if i not in left_idxs
            ], [v for i, v in enumerate(valid_target) if i not in left_idxs]

            if len(list(set(iter_target))) < len(valid_names):
                left_out -= 1
                continue

            logit.fit(iter_vals, iter_target)

            p_coeffs = logit.coef_
            pred_unk = logit.predict(unk_vals)
            prob_unk = logit.predict_proba(unk_vals)
            pred_val = logit.predict(left_vals)
            prob_val = logit.predict_proba(left_vals)

            for i in range(len(unk_idxs)):
                novel_key[i].append(
                    (valid_names[pred_unk[i]], prob_unk[i][pred_unk[i]]))

            for i, j in enumerate(left_idxs):
                iter_key[j].append(
                    (valid_names[pred_val[i]], prob_val[i][pred_val[i]]))

            left_cands = [
                i for i in range(len(valid_target)) if len(iter_key[i]) < 4
            ]

            if len(left_cands) == 0: break
            elif len(left_cands) <= left_out: left_idxs = left_cands
            else:
                left_idxs = np.random.choice(left_cands,
                                             left_out,
                                             replace=False)

        out = rage_classify_outputs.Classifier_Unknown_Output(self.options)

        for i, dubs in novel_key.items():
            votes = dd(float)
            name, orig_id = unk_key[i]
            for a, b in dubs:
                votes[a] += b
            scrs = sorted(votes.items(), key=lambda X: X[1], reverse=True)
            out.add_pred(name, orig_id, scrs[0][0],
                         scrs[0][1] / sum([sc[1] for sc in scrs]))

        for i, dubs in iter_key.items():
            votes = dd(float)
            name, orig_id = known_key[i]
            for a, b in dubs:
                votes[a] += b
            scrs = sorted(votes.items(), key=lambda X: X[1], reverse=True)
            out.add_pred(name, orig_id, scrs[0][0],
                         scrs[0][1] / sum([sc[1] for sc in scrs]))
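# Both vote-aggregation loops above reduce a list of (label, probability) pairs to a single call
# plus a normalized confidence. The same reduction as a tiny standalone helper (hypothetical
# name and data):
from collections import defaultdict as dd

def aggregate_votes(dubs):
    votes = dd(float)
    for label, prob in dubs:
        votes[label] += prob  # sum the probability mass per predicted label
    scrs = sorted(votes.items(), key=lambda kv: kv[1], reverse=True)
    return scrs[0][0], scrs[0][1] / sum(v for _, v in scrs)

best, score = aggregate_votes([("tumor", 0.9), ("normal", 0.6), ("tumor", 0.5)])
# best == "tumor", score ~= 0.7 (1.4 of the 2.0 total probability mass)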
Ejemplo n.º 57
0
def parse_gene_ref(ref_gene):
    ##some gene ids are not in txt form, provide ability to parse BED
    path, ext = os.path.splitext(ref_gene)
    if ext.lower() == '.bed':
        reader = BEDFile(ref_gene)
    else:
        reader = KnownGeneFile(ref_gene)
    gene_ref = dd(list)  # all genes per chromosome: keys = chromID, value = [list of genes]
    gene_info = {}  # all information about each gene: keys = geneID, value = dictionary of gene info
    chrom_info = {}  # per-chromosome AVL tree: keys = chromID, value = AVL(genes ordered by start site)
    for ref_dict in reader:
        if ext.lower() == '.bed':
            ref_dict['txStart'] = ref_dict['chromStart']
            ref_dict['txEnd'] = ref_dict['chromEnd']


        # determine intervals for promoter, gene, and downstream
        if ref_dict['strand'] == '+':  #if gene in 5' to 3' orientation
            promoter_coords = max(
                ref_dict['txStart'] - 1 - opts.upst_win, 0
            ), ref_dict['txStart'] - 1  #find the start and end of the promoter
            gene_coords = ref_dict['txStart'], ref_dict[
                'txEnd']  #find the start and end of gene
            #use these coordinates if we're trying to window around TSS
            window_coords = ref_dict['txStart'] + 1, ref_dict[
                'txStart'] + opts.dnst_win
            downstream_coords = ref_dict['txEnd'] + 1, ref_dict[
                'txEnd'] + 1 + opts.dnst_win
            ref_dict['promoter_coords'] = promoter_coords
            ref_dict['gene_coords'] = gene_coords
            ref_dict['window_coords'] = window_coords
            ref_dict['downstream_coords'] = downstream_coords
        else:
            promoter_coords = ref_dict['txEnd'] + 1, ref_dict[
                'txEnd'] + 1 + opts.upst_win  # +1 because we're using 1 based indexing
            gene_coords = ref_dict['txStart'], ref_dict['txEnd']
            window_coords = ref_dict['txEnd'] - opts.dnst_win, ref_dict[
                'txEnd']
            downstream_coords = ref_dict[
                'txStart'] - 1 - opts.dnst_win, ref_dict[
                    'txStart'] - 1  # -1 because we're using 1 based indexing
            ref_dict['promoter_coords'] = promoter_coords
            ref_dict['gene_coords'] = gene_coords
            ref_dict['window_coords'] = window_coords
            ref_dict['downstream_coords'] = downstream_coords

        gene_ref[ref_dict['chrom']].append(ref_dict)
        gene_info[ref_dict['name']] = ref_dict
        # put relevant information about the gene into the per-chromosome AVL tree
        if ref_dict['chrom'] not in chrom_info:
            chrom_info[ref_dict['chrom']] = avl.AVLTree()  # new AVL tree for this chromosome
        if ref_dict['strand'] == '+':
            chrom_info[ref_dict['chrom']].insert(
                (ref_dict['promoter_coords'][0],
                 ref_dict['downstream_coords'][1], ref_dict['name']))
        else:
            chrom_info[ref_dict['chrom']].insert(
                (ref_dict['downstream_coords'][0],
                 ref_dict['promoter_coords'][1], ref_dict['name']))

    return gene_ref, gene_info, chrom_info
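# The strand-dependent window arithmetic above is easier to read in isolation. A compact sketch
# with explicit arguments in place of the opts globals (hypothetical helper; 1-based, inclusive
# coordinates as in the comments above):
def region_coords(tx_start, tx_end, strand, upst_win, dnst_win):
    """Return (promoter, gene, downstream) intervals for one transcript."""
    gene = (tx_start, tx_end)
    if strand == '+':
        promoter = (max(tx_start - 1 - upst_win, 0), tx_start - 1)
        downstream = (tx_end + 1, tx_end + 1 + dnst_win)
    else:  # transcription runs right to left, so the windows flip
        promoter = (tx_end + 1, tx_end + 1 + upst_win)
        downstream = (tx_start - 1 - dnst_win, tx_start - 1)
    return promoter, gene, downstream

# e.g. a '+' strand gene at 1000-2000 with 500 bp windows:
# promoter (499, 999), gene (1000, 2000), downstream (2001, 2501)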
                type=str,
                help='Column name for gene id in the other file')
ap.add_argument('--other_disease',
                required=True,
                type=str,
                help='Column name for disease id in the other file')
ap.add_argument(
    '--pheno_series',
    required=True,
    type=str,
    help=
    'A JSON dictionary containing mappings between disease MIM IDs and their phenotypic series numbers'
)
args = ap.parse_args()

omim_dict = dd(set)
with open(args.omim) as omim_fh:
    header = omim_fh.readline().strip().split("\t")
    for line in omim_fh:
        lines = line.strip().split("\t")
        #Dictionary indexed by gene id, containing a set of disease identifiers as values
        omim_dict[lines[header.index(args.omim_gene)]].add(lines[header.index(
            args.omim_disease)])

with open(args.pheno_series, 'r') as ps_h:
    ps = json.load(ps_h)

with open(args.other) as other_fh:
    header = other_fh.readline().strip().split("\t")
    print("\t".join(header))
    for line in other_fh:
Ejemplo n.º 59
0
        peak_info[peak['name']] = peak
        if peak['chrom'] not in chrom_peaks.keys():
            chrom_peaks[peak['chrom']] = [(peak[start_field], peak['name'])]
        else:
            chrom_peaks[peak['chrom']].append(
                (peak[start_field], peak['name']))

    peaks_writer = DictWriter(peak_output,
                              output_fields,
                              delimiter='\t',
                              extrasaction='ignore',
                              lineterminator='\n')
    peaks_writer.writerow(dict([(k, k) for k in output_fields]))
    unique_genes = set()
    map_stats = dd(int)
    rowcount = 0

    interval = 1000
    if totalrows > 100000:
        interval = 10000

    print '\nParsing %d rows from the peak file; progress will be reported every %d rows' % (
        totalrows, interval)

    peaks_without_genes = []
    genes_without_peaks = []
    #walk through the peaks in a chromosome
    for chrom in chrom_peaks:
        heapq.heapify(
            chrom_peaks[chrom])  # heapify so peaks pop off in order of position along the chromosome
Ejemplo n.º 60
0
# Print experiment configurations.
keys = list(args.__dict__.keys())
keys.sort()
strings = "=" * 64 + "\n" + "\n".join(
    [k + "=" + str(args.__dict__[k]) for k in keys]) + "\n" + "_" * 64
print(strings)
with open(os.path.join(result_dir, "log.txt"), "a") as f:
    f.write(strings + "\n")

# Load the training and test sets of Drug-Disease Relations dataset (DDR).
X_train, X_test, X_elmo_train, X_elmo_test, \
X_sparse_train, X_sparse_test, y_train, \
y_test, feat2idx, vocab = get_ddr_dataset(args)

# Record the ground truth and model predictions.
results = dd(dd)
results["X_test"] = X_test
results["y_test"] = y_test

# Train the model.
rnn = model(vocab=vocab,
            model_name=args.model_name,
            max_iter=args.max_iter,
            eta=args.eta,
            batch_size=args.batch_size,
            test_batch_size=args.test_batch_size,
            hid_dim=args.hid_dim,
            hid_hb_dim=args.hid_hb_dim,
            emb_dim=args.emb_dim,
            feat_dim=len(feat2idx),
            max_len=args.max_len,