Code Example #1
 def test_everseen(self):
     """ensure duplicate elements are ignored"""
     u = mi.unique_everseen('AAAABBBBCCDAABBB')
     self.assertEqual(
         ['A', 'B', 'C', 'D'],
         list(u)
     )
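The behavior the test asserts is order-preserving de-duplication: unique_everseen yields each element only the first time it appears, and later repeats (including the trailing 'AABBB') are skipped. A quick standalone sketch, assuming only that more_itertools is installed and imported as mi like in the test above:

import more_itertools as mi

letters = 'AAAABBBBCCDAABBB'
print(list(mi.unique_everseen(letters)))  # ['A', 'B', 'C', 'D']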
Code Example #2
def wordNetNER(document):
	plant_sns = (wn.synsets('plant', pos="n"))
	plant = plant_sns[1] #(botany) a living organism lacking the power of locomotion #hardcoded

	wordnet_names = []
	#wordnet_lemmatizer = WordNetLemmatizer   ##Lemmatizer doesn't work....
	for word in document:
		#word = wordnet_lemmatizer.lemmatize(word) ##Lemmatizer doesn't work...
		mySynsets = wn.synsets(word, pos="n")

		i = 0
		for i in range(0, 3):
			try:
				given_word = mySynsets[i] #tries first 3 synsets
				definition = (given_word.definition())
				p1 = re.compile('plant(s?)\s')
				p2 = re.compile('organism(s?)\s')
				p3 = re.compile('animal(s?)\s')
				match1 = p1.search(definition)
				match2 = p2.search(definition)
				match3 = p3.search(definition)

				if match1 or match2 or match3:  #if the given word has "plants" or "animals" in the def, check to see how similar it is to "plant"
					similarity_score = (given_word.path_similarity(plant)) #check similarity score
					if similarity_score >= 0.2:
						#print(similarity_score)
						#print ("The words: "+(str(given_word)) + "  has a sim score of:  " +str(similarity_score))
						wordnet_names.append(word)
						named_entities.append(word)
			#hypernym = given_word.hypernyms() #hypernym is list #synset 'organism' exists #can't search in the hypernyms....hmm...
				i += 1
			except IndexError:
				pass
	wordnet_ner = (list(unique_everseen(wordnet_names)))
	return wordnet_ner
Code Example #3
File: skill_search.py Project: mrbhandari/jobsearch
def fetch_skill_api():
    
    list_of_files = ['pre_6.txt']
    
    #list_of_letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']
    
    for i in list_of_files:
      master_skills_list = []
      i_path_join = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "static", "data", i)
      with open(i_path_join, 'r') as fp:
        contents = fp.readlines()
        #print contents
        for line in contents:
          print line
          master_skills_list.extend(autosuggest_api(line))
      
      print master_skills_list
      
      
      c = list(unique_everseen(master_skills_list))
      ioutput_path_join = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "static", "data", i + 'out')
      
      with open(ioutput_path_join, 'w') as fp:
        for item in c:
          fp.write("%s\n" % item)
Code Example #4
File: __init__.py Project: rphilander/proteus
def interpret(atom_index, grammar, user_input):
    stripped_input = strip_punctuation(user_input)

    base_atoms_raw = atom_index.query(stripped_input)
    base_atoms = []
    for a in base_atoms_raw:
        base_atoms.append(a.clone_nonstopword())

    stop_words = filter(lambda a: a.get_stopword(), grammar.get_atoms())
    extra_atoms = []
    for stop_word in stop_words:
        if not stop_word in base_atoms:
            extra_atoms.append(stop_word)
    expanded_atoms = base_atoms + extra_atoms

    result = []
    for query_pattern in grammar.get_query_patterns():
        phrases = generate_phrases(grammar, expanded_atoms)
        queries = query_pattern.resolve(phrases)
        for query in queries:
            result.append(query)
    result = filter(lambda q: q.get_score() > 0, result)
    result = filter(lambda q: q.validate(base_atoms), result)
    result = list(unique_everseen(result, lambda q: q.get_english()))
    result.sort(key=lambda q: q.get_score())
    result.reverse()
    for q in result:
        q.set_base_atoms(base_atoms)
    return result
Code Example #5
File: fuel.py Project: xwa9860/FIG
    def create_a_pb_unit_cell(self,
                              fpb_prop,
                              uc_name
                              ):
        '''
        fpb_prop: a tuple contains:
            fuel_temps: temperature list for unique pebbles in the unit cell
            a matrix of unique pebbles x n layers of fuel in a triso
            coating_temps: a list that contains temp for each of the non-fuel layers in triso, e.g. 4x5
            cgt: central graphite temperature
            sht: shell temperature
            burnups: a list of 14 burnups
        uc_name: unit cell name
        '''
        fuel_temps, coating_temps, cgt, sht, uc_name, burnups, pb_comp_dir = fpb_prop
        fpb_list = []
        unique_fpb_list = {}
        unique_burnups = list(unique_everseen(burnups))
        unique_burnup_nb = len(unique_burnups)
        assert fuel_temps.shape[0] == unique_burnup_nb, 'wrong dimension %s' %str(fuel_temps.shape)
        assert coating_temps.shape[0] == unique_burnup_nb, 'wrong dimension' 

        # create a list of unique pebbles
        for i, bu in enumerate(unique_burnups):
            pb_name = 'pb%s%d' % (uc_name, bu)
            unique_fpb_list[bu] = self.create_a_fuel_pebble(fuel_temps[bu-1, :], 
                                                            coating_temps[unique_burnups[i]-1, :],
                                                            cgt, sht,
                                                            pb_name,
                                                            unique_burnups[i], 
                                                            pb_comp_dir)
        # create a list of all the 14 fuel pebbles, some of them are exactly the same
        for bu in burnups:
            fpb_list.append(unique_fpb_list[bu])
        return fpb_list
Code Example #6
def kth_smallest_elem(node, k):
    ls = inorder_traversal(node)
    # unique_everseen removes duplicates from a list in O(N) time according
    # to
    # http://stackoverflow.com/questions/480214/how-do-you-remove-duplicates-from-a-list-in-python-whilst-preserving-order
    result = list(unique_everseen(ls))
    return result[k - 1]
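The comment above points at the classic recipe that unique_everseen implements for hashable items: remember everything already emitted in a set, so each membership check is O(1) on average and a full pass stays O(N). A rough sketch of that recipe, not the library's actual implementation:

def dedupe_preserving_order(items):
    """Order-preserving de-duplication for hashable items (sketch of the O(N) recipe)."""
    seen = set()
    out = []
    for item in items:
        if item not in seen:  # average O(1) lookup in a set
            seen.add(item)
            out.append(item)
    return out

print(dedupe_preserving_order([3, 1, 3, 2, 1]))  # [3, 1, 2]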
Code Example #7
def ref_insert_line(line, form):
    """
    Inserts \ref{} functions into tex files by substituting for a regex.

    :param line: a line from a tex file, in the form of a string,
        which we want to process and insert references into
    :param form: a regular expression specification
        for a string to be replaced.
    :return: a string, wherein all of the strings specified by
        form are replaced by \ref{form}
    """
    # lineIterator = form.search(line)
    # searchAndSub = []
    # lineNew = line
     # while searchAndSub is not None:
    #     searchAndSub = form.search(line)
    #     lineNew = lineNew.replace()

    searchresults = form.findall(line)
    iterableStrings = list(unique_everseen(searchresults))
    lineNew = line
    for substring in iterableStrings:
        lineNew = lineNew.replace(substring, r'(\ref{' + substring[1:-1] + r'})')

    return lineNew
Code Example #8
File: models.py Project: volgoweb/wt
    def get_tree(self, node=None, filtered_ids=[]):
        """
        node - the root object from which the tree is built;
        if it is not specified, the tree is built from all objects.
        """
        def get_descendants(node):
            descendents = []
            children = node.get_children()
            children.filter(pk__in=filtered_ids)
            for n in children:
                n_descendents = get_descendants(n)
                n_descendents = [n for n in n_descendents if n.pk in filtered_ids]
                descendents += n_descendents
            return [node] + descendents

        if node:
            tree = get_descendants(node)
        else:
            tree = []
            lev1_pages = self.filter(level=1)
            if filtered_ids:
                lev1_pages = self.filter(pk__in=filtered_ids)
            for node in lev1_pages:
                tree += get_descendants(node)
        from  more_itertools import unique_everseen
        tree = list(unique_everseen(tree))
        return tree
Code Example #9
File: stats.py Project: ecdavis15/PokemonChart
def main():
	names = []
	types = ['Grass', 'Poison', 'Fire', 'Dragon', 'Flying', 'Water', 'Bug', 'Normal', 'Electric', 'Ground', 'Fairy', 'Fighting', 'Psychic', 'Rock', 'Steel', 'Ice', 'Ghost', 'Dark']
	scrapeNames(names, types)

	pokemon = {
		'images': {},
		'heights': {},
		'weights': {},
		'powers': {},
		'names': []
	}

	scrapePowers(pokemon, names)
	scrapeHeightsWeights(pokemon, names)

	names = list(unique_everseen(names))
	pokemon['names'] = names

	scrapeImages(pokemon, names)

	fileName = 'stats/stats.json'
	file = open(fileName, 'w')
	json.dump(pokemon, file, indent = 2)
	file.close()
Code Example #10
def _countEntities(cols):
	for i in range(len(cols)):
		entries = cols[i]["entries"]
		entityCount = 0
		multipleEnts = False
		for entry in entries:
			entrySoup = BeautifulSoup(entry, "lxml")
			links = entrySoup.findAll("a")
			linksHref = [aTag["href"] for aTag in links]
			linksHref = list(unique_everseen(linksHref))
			linksCount = len(links)
			for link in links:
				# Images are not counted
				if link.find("img") != None:
					linksCount -= 1
				# Likewise, only Wikipedia-internal links are allowed
				elif link["href"][0:5] != "/wiki":
					linksCount -= 1
				# Some tables contain duplicate links (e.g. TableID = 513, List of Olympic medalists in basketball)
				elif link["href"] in linksHref:
					linksHref.remove(link["href"]) # delete on the first occurrence
				elif link["href"] not in linksHref:
					linksCount -= 1 # on every further occurrence, correct the count
			if linksCount > 0:
				entityCount += 1
				if linksCount > 1:
					multipleEnts = True

		# Scoring: a maximum of 50 points is possible (100% are entities)
		cols[i]["rating"] = int(math.floor(MAX_ENTITIES_POINTS * (entityCount / len(entries))))
		cols[i]["entityCount"] = entityCount
		cols[i]["multipleEntities"] = multipleEnts
Code Example #11
File: aggregator.py Project: max-k/flask-multi-redis
 def keys(self, pattern):
     """Aggregated keys method."""
     def _keys(node, pattern):
         for result in node.keys(pattern):
             self._output_queue.put(result)
     results = self._runner(_keys, pattern)
     # return list(OrderedDict.fromkeys(results))
     return sorted(list(unique_everseen(results)))
Code Example #12
File: Sudoku.py Project: marcoalbuquerque-sfu/CS486
	def clash_table(self,i,j):
		units = [];
		units.extend(self.row(i));
		units.extend(self.col(j));
		units.extend(self.box(which_box(i,j)));
		units = list(unique_everseen(units));
		values = list(chain(*[ list(unit.available_values) for unit in units ]));
		return(table(values));
Code Example #13
File: beta_nmf_class.py Project: mikimaus78/groupNMF
 def __init__(self, data=np.asarray([[0, 0]]), cls_label=np.asarray([0]),
              ses_label=np.asarray([0]), buff_size=BUFF_SIZE,
              n_components=(K_CLS, K_SES, K_RES), beta=BETA,
              NMF_updates='beta', n_iter=N_ITER, lambdas=[0, 0, 0],
              normalize=False, fixed_factors=None, verbose=0,
              dist_mode='segment',Wn=None):
     self.data_shape = data.shape
     self.buff_size = np.min((buff_size, data.shape[0]))
     self.n_components = np.asarray(n_components, dtype='int32')
     self.beta = theano.shared(np.asarray(beta, theano.config.floatX),
                               name="beta")
     self.verbose = verbose
     self.normalize = normalize
     self.lambdas = np.asarray(lambdas, dtype=theano.config.floatX)
     self.n_iter = n_iter
     self.NMF_updates = NMF_updates
     self.iters = {}
     self.scores = []
     self.dist_mode = dist_mode
     if fixed_factors is None:
         fixed_factors = []
     self.fixed_factors = fixed_factors
     fact_ = np.asarray([base.nnrandn((self.data_shape[1],
                                       np.sum(self.n_components)))
                         for i in more_itertools.unique_everseen(itertools.izip(cls_label,
                                                                                ses_label))])
     self.W = theano.shared(fact_.astype(theano.config.floatX), name="W",
                            borrow=True, allow_downcast=True)
     fact_ = np.asarray(base.nnrandn((self.data_shape[0],
                                      np.sum(self.n_components))))
     self.H = theano.shared(fact_.astype(theano.config.floatX), name="H",
                            borrow=True, allow_downcast=True)
     self.factors_ = [self.H, self.W]
     if Wn is not None:
         self.Wn = Wn
     self.X_buff = theano.shared(np.zeros((self.buff_size,
                                           self.data_shape[1])).astype(theano.config.floatX),
                                 name="X_buff")
     if (self.NMF_updates == 'groupNMF') & (self.dist_mode == 'iter'):
         self.cls_sums = theano.shared(np.zeros((np.max(cls_label)+1,
                                                self.data_shape[1],
                                                self.n_components[0])
                                                ).astype(theano.config.floatX),
                                       name="cls_sums",
                                       borrow=True,
                                       allow_downcast=True)
         self.ses_sums = theano.shared(np.zeros((np.max(ses_label)+1,
                                                self.data_shape[1],
                                                self.n_components[1])
                                                ).astype(theano.config.floatX),
                                       name="ses_sums",
                                       borrow=True,
                                       allow_downcast=True)
         self.get_sum_function()
     self.get_updates_functions()
     self.get_norm_function()
     self.get_div_function()
Code Example #14
def getLessonList(courselink, quality):
    global session
    coursehtml = (session.get(courselink)).text

    lessonLinkRegex = re.compile('https?://www.cybrary.it/video/\w+(?:-[\w]+)*/')
    matchedLessonLink = list(unique_everseen(lessonLinkRegex.findall(coursehtml)))

    for link in matchedLessonLink:
        print "Downloading "+link
        downloadVideos(getVideoLink(link), quality)
Code Example #15
File: base.py Project: rserizel/groupNMF
def reorder_cls_ses(data, cls, ses, with_index=False):
    """reorder the data such that there is only
    one continuous bloc for each pair class/session

    Parameters
    ----------
    data : array
        the data
    cls : array
        the class labels for the data
    ses : array
        the session label for the data
    with_index : Boolean (default False)
        if True, the function returns the reordered indexes together
        with data and labels

    Returns
    -------
    data : array with the same shape as data
        reordered data
    cls : array with the same shape as cls
        reordered class labels
    ses : array with the same shape as ses
        reordered session labels
    ind : array with the same shape as data.shape[1]
        reordered indexes (only if with_index==True)
    """

    data_ordered = np.zeros((data.shape))
    cls_ordered = np.zeros((cls.shape))
    ses_ordered = np.zeros((ses.shape))
    if with_index:
        index = np.arange(data.shape[1])
        index_ordered = np.zeros((index.shape))
    data_fill = 0
    for i in more_itertools.unique_everseen(itertools.izip(cls, ses)):
        ind = np.where((cls == i[0]) & (ses == i[1]))[0]
        bloc_length = data[(cls == i[0]) & (ses == i[1]), :].shape[0]
        data_ordered[data_fill:data_fill+bloc_length, ] = data[ind, :]
        cls_ordered[data_fill:data_fill+bloc_length] = cls[ind]
        ses_ordered[data_fill:data_fill+bloc_length] = ses[ind]
        if with_index:
            index_ordered[data_fill:data_fill+bloc_length] = index[ind]
        data_fill += bloc_length
    if with_index:
        return {
            'data': data_ordered,
            'cls': cls_ordered,
            'ses': ses_ordered,
            'ind': index_ordered}
    else:
        return {
            'data': data_ordered,
            'cls': cls_ordered,
            'ses': ses_ordered}
Code Example #16
File: getData.py Project: stunax/Ws
def onlymonths(wordfile,months):
    if len(wordfile) == months:
        return map(lambda x:x.split("\n")[0],wordfile)
    splt = map(lambda x: x.split(","),wordfile)
    startdate = map(lambda x: x[0],splt)
    times = map(lambda x: int(x[1]),splt)
    times = mergedate(startdate,times)
    startdate = list(unique_everseen(startdate))
    wordfile = map(lambda x:x[0]+ "," + str(x[1]),zip(startdate,times))


    return wordfile
Code Example #17
File: init.py Project: pebble/flotilla
def bootstrap(region, environment, domain, instance_type, coreos_channel,
              coreos_version, available, flotilla_container):
    coreos = CoreOsAmiIndex()
    cloudformation = FlotillaCloudFormation(environment, domain, coreos)
    region_meta = RegionMetadata(environment)
    regions = [r for r in unique_everseen(region)]
    cloudformation.tables(regions)
    region_params = region_meta.store_regions(regions, available, instance_type,
                                              coreos_channel, coreos_version,
                                              flotilla_container)
    cloudformation.schedulers(region_params)
    logger.info('Bootstrap complete')
Code Example #18
def group_runs(data_dict):
    """Find unique model/physics groups"""

    all_info = data_dict.keys()

    model_physics_list = []
    for key, group in groupby(all_info, lambda x: x[0:2]):
        model_physics_list.append(key)

    family_list = list(unique_everseen(model_physics_list))

    return family_list
Code Example #19
def matchOrganisms2(bigrams):
	latin_names = []
	for grams in bigrams:
		(w1, w2) = grams.split(" ")
		p1 = re.compile('[a-z]{2,}[b-df-hj-np-tv-z]{1,}(i$|is$|ia$|ae$|um$|us$|es$|arum$)') #wont pick up ones that match in 'a'; too many false positives
		match1 = p1.search(w1)
		p2 = re.compile('[a-z]{2,}[b-df-hj-np-tv-z]{1,}(a$|i$|is$|ia$|ae$|um$|us$|es$|arum$)') #picks up the word 'genes'
		match2 = p2.search(w2)
		if match1 and match2:
			latin_names.append(grams)
			named_entities.append(grams)
	return (list(unique_everseen(latin_names)))
Code Example #20
def filters_mapping_content(exact_results: List, similarity_results: List, threshold: float) -> Tuple:
    """Parses compiled mapping results, when results exist, to determine a final aggregated mapping result.

    Args:
        exact_results: A nested list containing 3 sub-lists where sub-list[0] contains exact match uris, sub-list[1]
            contains exact match labels, and sub-list[2] contains exact match evidence.
        similarity_results: A nested list containing 3 sub-lists where sub-list[0] contains similarity uris,
            sub-list[1] contains similarity labels, and sub-list[2] contains similarity evidence.
        threshold: A float that specifies a cut-off for filtering cosine similarity results.

    Returns:
        A tuple of lists containing mapping results for a given row. The first tuple contains exact mapping results
            and the second contains similarity results. Both lists contains 3 items: uris, labels, and evidence.
    """

    exact_uri, exact_label, exact_evid = exact_results
    sim_uri, sim_label, sim_evid = similarity_results
    exact_result: Optional[List[Any]] = [None, None, None]
    sim_result: Optional[List[Any]] = [None, None, None]

    # format results
    if exact_uri:
        exact_result = [list(unique_everseen(exact_uri)),
                        list(unique_everseen(exact_label)),
                        ' | '.join(exact_evid)]
    if sim_uri:
        if any(x for x in sim_evid[0].split(' | ') if float(x.split('_')[-1]) == 1.0):
            evid_list = sim_evid[0].split(' | ')
            sim_keep = [evid_list.index(x) for x in evid_list if float(x.split('_')[-1]) == 1.0]
            uris, labels = [sim_uri[x] for x in sim_keep], [sim_label[x] for x in sim_keep]
            sim_result = [uris, labels, ' | '.join([evid_list[x] for x in sim_keep])]
        elif any(x for x in sim_evid[0].split(' | ') if float(x.split('_')[-1]) >= threshold):
            evid_list = sim_evid[0].split(' | ')
            sim_keep = [evid_list.index(x) for x in evid_list if float(x.split('_')[-1]) >= threshold]
            uris, labels = [sim_uri[x] for x in sim_keep], [sim_label[x] for x in sim_keep]
            sim_result = [uris, labels, ' | '.join([evid_list[x] for x in sim_keep])]
        else:
            sim_result = [sim_uri, sim_label, ' | '.join(sim_evid)]

    return exact_result, sim_result
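The similarity branch above assumes every evidence string ends with an underscore-separated cosine similarity score, which is what the float(x.split('_')[-1]) calls parse; the ' | ' separator joins individual pieces of evidence. A toy illustration of that threshold filter with made-up evidence strings (the label_score format is an assumption, not real mapping output):

threshold = 0.8
sim_evid = 'heart_0.95 | cardiac muscle_0.70 | myocardium_1.0'

evid_list = sim_evid.split(' | ')
keep = [e for e in evid_list if float(e.split('_')[-1]) >= threshold]
print(keep)  # ['heart_0.95', 'myocardium_1.0']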
Code Example #21
def evidence_writer(evidences, sentence_id, data_source, resource_v,
                    rule_predicates, rule_type):
    item_set = OrderedSet()
    entity_set = []
    # print rule_predicates
    for evidence in evidences:
        # print evidence[1]
        if evidence[1] in rule_predicates:
            if evidence[0] == resource_v[0] and evidence[2] == resource_v[
                    1] and evidence[1] == data_source:
                pass
            elif evidence[0] == resource_v[1] and evidence[2] == resource_v[
                    0] and evidence[1] in ["keyPerson", "capital"]:
                pass
            else:
                try:
                    forbidden = ['"', ':', '#', '&', '=', ' ']
                    if not any(c in evidence[0] or c in evidence[2] for c in forbidden):
                        entity_1 = '"' + evidence[0] + '"'
                        entity_2 = '"' + evidence[2] + '"'
                        item_set.add(evidence[1] + '(' + entity_1 + ',' + entity_2 + ').')
                except:
                    pass
        else:
            pass
            # print "here"
    # print item_set
    with open(evidence_path + str(sentence_id) + '_.txt', 'wb') as csvfile:
        for i in item_set:
            if '*' not in i:
                try:
                    # print i
                    csvfile.write(i.encode('utf-8') + '\n')
                    # csvfile.write(i+'\n')
                except:
                    pass
    with open(evidence_path + str(sentence_id) + '_.txt', 'r') as f, \
            open(evidence_path + str(sentence_id) + '_unique.txt', 'wb') as out_file:
        out_file.writelines(unique_everseen(f))
    remove_file = evidence_path + str(sentence_id) + '_.txt'
    os.remove(remove_file)
    return item_set, entity_set
Code Example #22
def main(ifilename):

    file_name = ifilename
    file_name_wo_end = file_name[:-4]
    f = open(file_name)
    file_content = f.read()

    m3 = re.findall('xmlUrl=\"(.+?)\"', file_content)
    m4 = re.findall('text=\"(.+?)\"', file_content)
    m4.pop(0)

    result = []
    for i in range(len(m4)):
        print(m4[i], m3[i])
        result.append((m4[i], m3[i]))

    rss_dict = {}
    for name, url in tqdm(result):
        try:
            r = requests.get(
                url,
                headers={
                    'User-Agent':
                    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0'
                },
                timeout=15)
        except:
            print("failed: " + url)

        rss_dict[name] = r.text
        tqdm.write('Downloaded ' + url + '\n')
    json_filename = file_name_wo_end + ".json"
    with open(json_filename, 'w') as outfile:
        json.dump(rss_dict, outfile)

    all_urls_3 = []

    for content in rss_dict.values():
        m = re.findall('\"(http\S+?\.(?:mp3|mp4))[\"\?]', content,
                       re.IGNORECASE)
        if m:
            for item in m:
                all_urls_3.append(item)

    txt_filename = file_name_wo_end + ".txt"
    f = open(txt_filename, 'w')

    unique_urls = list(unique_everseen(all_urls_3))
    for url in unique_urls:
        f.write(url)
        f.write("\n")
    f.close()
Code Example #23
File: gotchi.py Project: kbooten/speakingegg
    def get_most_informative_features(self, n=50):
        # Determine the most relevant features, and display them.
        cpdist = self.classifier._feature_probdist

        Feats = []

        for (fname, fval) in self.classifier.most_informative_features(n+5):

            def labelprob(l):
                return cpdist[l, fname].prob(fval)

            labels = sorted(
                [l for l in self.classifier._labels if fval in cpdist[l, fname].samples()],
                key=labelprob,
            )
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]

            Feats.append((fname, fval, l1, l0))
        #return Feats
            

        def format_double_feat(feature):
            if feature.startswith("("):
                feature = re.findall(r"\(u?[\"\'](.+)[\"\'], u?[\"\'](.+)[\"\']\)",feature)[0]
                if feature[0]=="<START>":
                    return "<span class='innerFeature'>%s</span> at the beginning" % feature[1]
                elif feature[1]=="<END>":
                    return "<span class='innerFeature'>%s</span> at the end" % feature[0]
                else:
                    feature = "<span class='innerFeature'>%s</span> followed by <span class='innerFeature'>%s</span>" % (feature[0],feature[1])
                    return str(feature)
            return "<span class='innerFeature'>%s</span>" % feature


        def format_feat(fname,fval,l1,l0):
            if l1=="good":
                if fval==None:
                    return "do not contain %s" % (format_double_feat(fname))
                else:
                    return "contain %s" % (format_double_feat(fname))

            elif l1=="bad":
                if fval==None:
                    return "contains %s" % (format_double_feat(fname))
                else:
                    return "do not contain %s" % (format_double_feat(fname))

        formatted_features = [format_feat(*f) for f in Feats]
        return list(more_itertools.unique_everseen(formatted_features))[:20]
Code Example #24
def stim_select_callback(attr, old, new, kwargs=plot_kwargs):
    # Values from widget are not exact stimuli names
    stim = get_filename(META, stim_select.value)

    image_cds.data = get_img(stim).data
    print('got_here')

    # remove current gaze plot
    remove_glyphs(image_plot, GAZE_COLORS)

    x_dim = int(META[stim]['x_dim'])
    y_dim = int(META[stim]['y_dim'])
    station_count = META[stim]['station_count']
    city = META[stim]['txt_name']

    # Grab old title, set new text
    t = matrix_plot.title
    t.text = stim_select.value + ' - (' + str(station_count) + ' stations)'

    # Retaining other settings
    color = color_select.value
    metric = METRICS.get(metric_select.value)
    matrix_cds.data = get_matrix_cds(stim, USERS, DF, color, metric).data

    # Yields unique 'xname's, preserving order
    if PRESENTING:
        order = list(unique_everseen(matrix_cds.data['xname']))
        matrix_plot.x_range.factors = order
        matrix_plot.y_range.factors = list(reversed(order))

    image_plot.x_range.start = 0
    image_plot.y_range.start = 0
    image_plot.x_range.end = x_dim
    image_plot.y_range.end = y_dim

    plot_w = x_dim + kwargs['min_border_left'] + kwargs['min_border_right']
    plot_h = y_dim + kwargs['min_border_top'] + kwargs['min_border_bottom']

    X = [
        item for sublist in matrix_cds.data['MappedFixationPointX']
        for item in sublist
    ]
    Y = [
        item for sublist in matrix_cds.data['MappedFixationPointY']
        for item in sublist
    ]
    duration = [
        item for sublist in matrix_cds.data['FixationDuration']
        for item in sublist
    ]

    fixation_cds.data = get_fixation_points(X, Y, duration).data
Code Example #25
class JusteatSpider(scrapy.Spider):
    name = "justeat"
    allowed_domains = ["just-eat.co.uk"]

    # download the restuarant_urls.txt from justeat1 spider output
    with open("restuarant_urls.txt") as f:
        restuarant_urls = [x.strip('\n') for x in f.readlines()]
    restuarant_urls = list(unique_everseen(restuarant_urls))

    # we need to run this in stages, about 7000 restaurants at a time max.
    print("total number of restaurants:  " + str(len(restuarant_urls)))
    print("first record:  " + str(restuarant_urls[0]))
    print("8000th record:  " + str(restuarant_urls[7999]))
    print("last record:  " + str(restuarant_urls[len(restuarant_urls) - 1]))

    while True:
        to_do = input("Enter Stage 1 to X (max 6 for now)  ")
        if to_do == "1" or to_do == "2" or to_do == "3" or to_do == "4" or to_do == "5" or to_do == "6":
            break

    # build a list of urls to scrap
    start_urls = []
    startrange = 7000 * (int(to_do) - 1)
    endrange = 7000 * (int(to_do))
    if endrange > len(restuarant_urls):
        endrange = len(restuarant_urls)
    for a in range(startrange, endrange):
        start_urls.append("https://www.just-eat.co.uk" +
                          str(restuarant_urls[a]))

    def parse(self, response):
        for sel in response.xpath('//div[@class="restaurantParts"]'):
            item = JusteatItem()
            item['name'] = sel.xpath('//*[@itemprop="name"]/text()').extract()
            item['address'] = sel.xpath(
                '//*[@itemprop="address"]/span/text()').extract()
            item['cuisine'] = sel.xpath(
                './/div/p/span[@itemprop="servesCuisine"]/text()').extract()
            item['ratingvalue'] = sel.xpath(
                './/div/p[@itemprop="aggregateRating"]/meta[@itemprop="ratingValue"]/@content'
            ).extract()
            item['ratingcount'] = sel.xpath(
                './/div/p[@itemprop="aggregateRating"]/meta[@itemprop="ratingCount"]/@content'
            ).extract()
            item['ratingbest'] = sel.xpath(
                './/div/p[@itemprop="aggregateRating"]/meta[@itemprop="bestRating"]/@content'
            ).extract()
            item['ratingworst'] = sel.xpath(
                './/div/p[@itemprop="aggregateRating"]/meta[@itemprop="worstRating"]/@content'
            ).extract()
            item['name_id'] = sel.xpath('.//@data-restaurant-id').extract()
            yield item
Code Example #26
def polygon_2_vtx(starting_vertex, edges_to_visit):
    from more_itertools import unique_everseen

    if not edges_to_visit:
        return

    closed = False

    print("Edges to visit:", edges_to_visit)
    subpolygon = []

    found_vertex = starting_vertex

    while not closed:
        for index, edge in enumerate(edges_to_visit.copy()):
            visiting_vertex = found_vertex

            #if visiting_vertex not in set(edge) and index==len(edges_to_visit.copy()):
            # Tracer()()
            #print("Not found in list of edges")
            #closed=True
            #break
            if visiting_vertex not in set(edge):
                continue
            subpolygon.append(visiting_vertex)

            print("Visiting vertex", visiting_vertex)

            found_starting_vtx = False
            subpolygon.append(found_vertex)

            print(visiting_vertex, " in ", edge)

            for index in set(edge):
                if visiting_vertex != index:
                    found_vertex = index
                    print("Found vertex:", found_vertex)
                    subpolygon.append(found_vertex)

            print("Removing edge", edge)
            edges_to_visit.discard(edge)
            print(edges_to_visit)
            if found_vertex == starting_vertex:
                subpolygon = list(unique_everseen(subpolygon))
                print("Back to starting vertex")
                closed = True
                break

    if len(subpolygon) <= 3:
        return
    else:
        return subpolygon
Code Example #27
def get_level_order(bms_table: dict) -> List[str]:
    """
    Get the level order from the table data.

    :param bms_table: table data
    :return: level_order
    """
    # First, enumerate the levels that appear in the data section, in order of appearance
    level_order_from_data = list(unique_everseen([chart["level"] for chart in bms_table["data"]]))

    # If the header section does not specify level_order, use the level order taken from the data section as-is (per the spec)
    if "level_order" not in bms_table["header"]:
        return level_order_from_data

    # If the header section does specify level_order, use it (per the spec)
    level_order = bms_table["header"]["level_order"]

    # However, apply the following processing so that it stays consistent with the data section (own interpretation)
    level_order = map(str, level_order)  # cast to str to match the data section (for some reason the spec is Union[str, int])
    level_order = list(unique_everseen(level_order))  # remove any duplicates
    omissions = [level for level in level_order_from_data if level not in level_order]  # levels in the data that are missing from the specified order...
    return level_order + omissions  # ...are appended at the end
Code Example #28
File: functions.py Project: JohnMops/Python
 def list_load_balancers_external_subnets(self):
     client = self.session.client('elbv2')
     lb_list_external = []
     for lb in client.describe_load_balancers()['LoadBalancers']:
         if lb['Scheme'] == 'internet-facing':
             lb_list_external.append(lb)
     if not lb_list_external:
         print(colored("[WARNING] No Public Load Balancers detected", "red"))
     sub_list = []
     for i in lb_list_external:
         for k in i['AvailabilityZones']:
             sub_list.append(k['SubnetId'])
     return list(unique_everseen(sub_list))
Code Example #29
    def working(self):
        """A list of the current tests with warning or errors. A
        device is working if the list is empty.

        This property returns, for the last test performed of each type,
        the one with the worst ``severity`` of them, or ``None`` if no
        test has been executed.
        """
        from ereuse_devicehub.resources.event.models import Test
        current_tests = unique_everseen(
            (e for e in reversed(self.events) if isinstance(e, Test)),
            key=attrgetter('type'))  # last test of each type
        return self._warning_events(current_tests)
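Iterating the events newest-first and keeping only the first occurrence of each type is a compact way to express "the latest event of each kind". The same idiom on plain tuples, with invented test names rather than Devicehub's event classes:

from more_itertools import unique_everseen

# (test_type, result) pairs in chronological order; hypothetical data.
events = [('StressTest', 'ok'), ('StorageTest', 'warning'), ('StressTest', 'error')]

latest_per_type = list(unique_everseen(reversed(events), key=lambda e: e[0]))
print(latest_per_type)  # [('StressTest', 'error'), ('StorageTest', 'warning')]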
Code Example #30
 def add_to_udl(self, new_udl):
     """
     Adds more dictionaries to the udl. Also sorts them and removes double
     entries. Doesn't add dictionaries with confirmed wrong urls or levels.
     """
     new_udl = [ud for ud in new_udl if ud["URL"] not in self.wrong_urls]
     if self.confirmed_level is not None:
         new_udl = [
             ud for ud in new_udl if ud["Level"] == self.confirmed_level
         ]
     self.udl = new_udl + self.udl
     self.udl = list(unique_everseen(self.udl))
     self.udl = sorted(self.udl, key=lambda k: k['Score'], reverse=True)
Code Example #31
    def getPods():
        Initconnection.Initconnection.loadConfig()
        apiV1 = client.CoreV1Api()
        Logging.Logging.log('Retrieving pods')
        ret = apiV1.list_pod_for_all_namespaces(watch=False)
        output = []
        for pod in ret.items:
            if pod is None:
                Logging.Logging.log('Pods were not found')
            else:
                output.append(pod.metadata.name)

        return list(unique_everseen(output))
Code Example #32
def weighted_cosine_match(a, b):
    """Cosine match w/ weights based on order """
    a_list = list(unique_everseen(a))
    b_list = list(unique_everseen(b))

    domain = list(unique_everseen(a_list + b_list))

    a_vec = [
        _weight_of(word, a_list, "descending") if word in a_list else 0
        for word in domain
    ]
    b_vec = [
        _weight_of(word, b_list, "ascending") if word in b_list else 0
        for word in domain
    ]

    product = sum(x * y for x, y in zip(a_vec, b_vec))

    a_mag = math.sqrt(sum(x * x for x in a_vec))
    b_mag = math.sqrt(sum(x * x for x in b_vec))

    return product / (a_mag * b_mag)
Code Example #33
 def parse1(self, a_pbed, input_file, type):
     if type == 's':
         str_list = []
         str_list.append(
             '%%---Pebble unit cell with position from input file\n' +
             'pbed %d %d "%s"\n' %
             (self.univ.id, a_pbed.coolant.gen.univ.id, input_file))
         str_list.append('%%---Coolant in the unit cell\n' +
                         a_pbed.coolant.generate_output())
         str_list.append('%%---Pebbles in the unit cell(pbed)\n')
         for pb in list(unique_everseen(a_pbed.pb_list)):
             str_list.append(pb.generate_output())
         return ''.join(str_list)
Code Example #34
    def get_groups_by_types(
            self,
            types: List[str]) -> Union[AlignmentGroup, List[AlignmentGroup]]:
        """Return AlignmentGroups by fragment type.

        :param types: list of types
        :return:
        """
        groups = self.groups_by_type()
        if isinstance(types, str):
            return groups[types]
        else:
            return list(unique_everseen(flatten([groups[t] for t in types])))
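The return expression chains two small steps: flatten the per-type lists of groups, then drop duplicates while keeping the first occurrence. A minimal sketch with toy values, assuming flatten behaves like more_itertools.flatten (the project may supply its own helper):

from more_itertools import flatten, unique_everseen

groups_by_type = {'typeA': ['g1', 'g2'], 'typeB': ['g2', 'g3']}
types = ['typeA', 'typeB']

merged = list(unique_everseen(flatten([groups_by_type[t] for t in types])))
print(merged)  # ['g1', 'g2', 'g3']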
Code Example #35
def evaluate(predicts, real_labels):

    error = 1 - accuracy_score(predicts, real_labels) # Compute error
    accuracy_score_of_all = accuracy_score(predicts, real_labels)

    print(f"\nTotal accuracy of predictions: {accuracy_score_of_all}")

    cnf_mat = confusion_matrix(real_labels, predicts, list(unique_everseen(real_labels)))  # Create confusion matrix
    print(cnf_mat)

    df_cm = pd.DataFrame(cnf_mat, index=list(unique_everseen(real_labels)), columns=list(unique_everseen(real_labels)),)
    col_sum = df_cm.sum(axis=1)
    df_cm = df_cm.div(col_sum,axis = 0)
    plt.figure(figsize=(20,20))
    heatmap = sns.heatmap(df_cm, annot=True)
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=8)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=8)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title("Confusion Matrix, Normalized")
    plt.show()
    return error,cnf_mat
Code Example #36
def names(vma):
    """ Obtain names of all coordinates defined in the V-Matrix.

        :param vma: V-Matrix
        :type vma: automol V-Matrix data structure
        :rtype: tuple(str)
    """

    name_mat = name_matrix(vma)
    _names = filter(lambda x: x is not None,
                    numpy.ravel(numpy.transpose(name_mat)))

    return tuple(more_itertools.unique_everseen(_names))
Code Example #37
    def getNodes():
        Initconnection.Initconnection.loadConfig()
        Logging.Logging.log('Retrieving Nodes')
        apiV1 = client.CoreV1Api()
        nodes = apiV1.list_node()
        endpoints = []
        for node in nodes.items:
            if node is None:
                Logging.Logging.log('No nodes found')
            else:
                endpoints.append(f'{node.metadata.name}')

        return list(unique_everseen(endpoints))
Code Example #38
def bootstrap(region, environment, domain, instance_type, coreos_channel,
              coreos_version, available, flotilla_container):
    coreos = CoreOsAmiIndex()
    cloudformation = FlotillaCloudFormation(environment, domain, coreos)
    region_meta = RegionMetadata(environment)
    regions = [r for r in unique_everseen(region)]
    cloudformation.tables(regions)
    region_params = region_meta.store_regions(regions, available,
                                              instance_type, coreos_channel,
                                              coreos_version,
                                              flotilla_container)
    cloudformation.schedulers(region_params)
    logger.info('Bootstrap complete')
Code Example #39
 def find_topics(self,test):
     topics = []
     tp_num = []
     for quest in test.sscquestions_set.all():
         tp_number = quest.topic_category
         try:
             tp_name = changeIndividualNames(tp_number,quest.section_category)
         except:
             tp_name = 'Topics'
         topics.append(tp_name)
     topics = list(unique_everseen(topics))
     num_questions = len(test.sscquestions_set.all())
     return topics,num_questions
Code Example #40
def step(context, modes):
    context.execute_steps(
        u"""then wait for element {element} identified by css_selector""".
        format(element=DETECTMODE))
    labels_list = context.browser.find_elements_by_css_selector(DETECTMODE)
    modes_list = modes.split(",")
    label_obtained_elements = []
    for element in labels_list:
        if element.text != "":
            label_obtained_elements.append(element.text)
    label_obtained_elements = list(unique_everseen(label_obtained_elements))
    assert modes_list == label_obtained_elements, "Error: the list of options is not the expected. Expected {expected}, obtained {obtained}".format(
        expected=modes_list, obtained=label_obtained_elements)
Code Example #41
    def getServices():
        Initconnection.Initconnection.loadConfig()
        Logging.Logging.log('Retrieving Services')
        apiV1 = client.CoreV1Api()
        services = apiV1.list_service_for_all_namespaces()
        output = []
        for service in services.items:
            if service is None:
                Logging.Logging.log('No services found')
            else:
                output.append(f'{service.metadata.name}')

        return list(unique_everseen(output))
Code Example #42
 def _filter_unique_results(results):
     return list(
         unique_everseen(
             results,
             key=lambda x: (
                 x["query"]["sequence_id"],
                 x["query"]["start"],
                 x["query"]["end"],
                 x["subject"]["sequence_id"],
                 x["subject"]["start"],
                 x["subject"]["end"],
             ),
         ))
Code Example #43
def df_to_mct(df):
    full_array = np.array(df)

    loadcases = np.array(df.columns.values)[1:]
    combinations = full_array[2:, 0]
    load_types = full_array[1, 1:]
    factors = full_array[2:, 1:]

    mct_string = '*LOADCOMB    ; Combinations\n; NAME=NAME, KIND, ACTIVE, bES, iTYPE, DESC, iSERV-TYPE, nLCOMTYPE, nSEISTYPE   ; line 1\n;      ANAL1, LCNAME1, FACT1, ...                                               ; from line 2'

    #Creates load combinations
    for row_index, row in enumerate(factors):
        combination = combinations[row_index]
        mct_string = mct_string + '\n' + '   NAME=' + str(
            combination) + ', GEN, ACTIVE, 0, 0, , 0, 0, 0\n'

        for factor_index, factor in enumerate(row):
            load_type = load_types[factor_index]
            loadcase = loadcases[factor_index]

            if factor_index != 0:
                mct_string = mct_string + ', '
            mct_string = mct_string + str(load_type) + ', ' + str(
                loadcase) + ', ' + str(factor)

    #Creates combination envelopes
    combination_envelopes = []
    for comb in combinations:
        combination_envelopes.append(comb.split('_')[0])
        combination_envelopes.append(comb.split('|')[0])

    combination_envelopes = list(unique_everseen(combination_envelopes))

    combination_groups = []
    for envelope in combination_envelopes:
        group = []
        for comb in combinations:
            if envelope in comb:
                group.append(comb)
        combination_groups.append(group)

    for group_index, group in enumerate(combination_groups):
        mct_string = mct_string + '\n' + '   NAME=' + str(
            combination_envelopes[group_index]
        ) + ', GEN, ACTIVE, 0, 1, , 0, 0, 0\n'
        for comb_index, comb in enumerate(group):
            if comb_index != 0:
                mct_string = mct_string + ', '
            mct_string = mct_string + 'CB' + ', ' + str(comb) + ', ' + '1.0'

    return mct_string
Code Example #44
File: kg_utils.py Project: callahantiff/PheKnowLator
def gets_entity_ancestors(graph: Graph,
                          uris: List[Union[URIRef, str]],
                          rel: Union[URIRef, str] = RDFS.subClassOf,
                          cls_lst: Optional[List] = None) -> List:
    """A method that recursively searches an ontology hierarchy to pull all ancestor concepts for an input entity.

    Args:
        graph: An RDFLib graph object assumed to contain ontology data.
        uris: A list of at least one ontology RDFLib URIRef object or string.
        rel: A string or RDFLib URI object containing a predicate.
        cls_lst: A list of URIs representing the ancestor classes found for the input class_uris.

    Returns:
        An ordered (desc; root to leaf) list of ontology objects containing the input uris ancestor hierarchy. Example:
            input: [URIRef('http://purl.obolibrary.org/NCBITaxon_11157')]
            output: ['http://purl.obolibrary.org/NCBITaxon_10239', 'http://purl.obolibrary.org/NCBITaxon_2559587',
                'http://purl.obolibrary.org/NCBITaxon_2497569', 'http://purl.obolibrary.org/NCBITaxon_11157']
    """

    prop = rel if isinstance(rel, URIRef) else URIRef(rel)
    cls_lst = [] if cls_lst is None else cls_lst
    cls_lst = list(
        unique_everseen([
            x if isinstance(x, URIRef) else URIRef(obo + x) for x in cls_lst
        ]))
    uris = list(
        unique_everseen(
            [x if isinstance(x, URIRef) else URIRef(obo + x) for x in uris]))
    ancs = list(
        unique_everseen(
            [j for k in [graph.objects(x, prop) for x in uris] for j in k]))
    if len(ancs) == 0 or len(set(ancs).difference(set(cls_lst))) == 0:
        return list(unique_everseen([str(x) for x in cls_lst]))
    else:
        uris = [x for x in ancs if x not in cls_lst]
        for i in uris:
            cls_lst.insert(0, i)
        return gets_entity_ancestors(graph, uris, prop, cls_lst)
Code Example #45
def monte_carlo(forward_,reverse_):
	forward=[]
	reverse=[]
	fasta_sequences = SeqIO.parse(open(forward_),'fasta')
	for fasta in fasta_sequences:
		name, sequence = fasta.id, fasta.seq
		forward.append(str(sequence).replace('\n', ''))
	fasta_sequences = SeqIO.parse(open(reverse_),'fasta')
	for fasta in fasta_sequences:
		name, sequence = fasta.id, fasta.seq
		reverse.append(str(sequence).replace('\n', ''))
	seed=1000 # Iterations
	matches=[]
	count=0
	cytosine=0
	fh=open('forward_d/monte.carlo', 'w+')
	match = 2
	mismatch = -1
	scoring = swalign.NucleotideScoringMatrix(match, mismatch)
	sw = swalign.LocalAlignment(scoring)
	while seed != 0:
		sequence=forward[int(random.random()*(len(forward)-1))] # get a random sequence from [ 0, N ] contigs
		width= int(random.random()*(len(sequence)-1) *0.20+13) # Take lower 13+[0-20] % of sequence length
		first=int(random.random()*(len(sequence)-1-width)) 
		subsequence=sequence[first:first+width]
		for x in reverse:
			index=x.find(subsequence)
			if  index != -1 :
				count_=ccount(reverse,reverse.index(x))+index
				fh.write(str(count_)+"\t"+str(eval(str(count_+width)))+"\n")
				for i in range(0,width):
					matches.append(count_+i)
					count+=1
			else:
				count_=ccount(reverse,reverse.index(x))+index
				alignment = sw.align(x,subsequence)
				cytosine+=alignment.dump()
		seed-=1
	fh.close()
	forward_length=ccount(forward,"1",True)
	xlim=ccount(reverse,"1",True)
	plt.hlines(1,1,xlim)  # Draw a horizontal line
	plt.xlim(0,xlim)
	plt.ylim(0.5,1.5)
	matches=list(unique_everseen(matches))
	y = np.ones(np.shape(matches))   # Make all y values the same
	plt.plot(matches,y,'|',ms = 40)  # Plot a line at each location specified in `matches`
	plt.axis('off')
	plt.show()
	print "[",count,"/ ",forward_length,"]  hits, [",cytosine,"]"
Code Example #46
File: Apriori.py Project: shyuwang/Apriori-Algorithm
def gen_Ck(k, L):
    flatten = [item for subtuple in L
               for item in subtuple]  #flattens all candidates
    uniqueflatten = list(
        unique_everseen(flatten))  #gets out duplicate candidates
    Ck = list(combinations(
        uniqueflatten,
        k))  #creates list of all possible combinations of k length
    for c in Ck[:]:  # iterate over a copy so removals do not skip candidates
        if has_infeq_subsets(k, L, c):
            Ck.remove(c)
    return Ck
Code Example #47
File: voc_utils.py Project: zhangyuygss/WSL
def get_image_url_list(category, data_type=None):
    """
    For a given data type, returns a list of filenames.

    Args:
        category (string): Category name as a string (from list_image_sets())
        data_type (string, optional): "train" or "val"

    Returns:
        list of strings: list of all filenames for that particular category
    """
    df = _load_data(category, data_type=data_type)
    image_url_list = list(unique_everseen(list(img_dir + df['fname'])))
    return image_url_list
Code Example #48
def unique_mail(emails: Iterator[Email]) -> Iterator[Email]:
    # remove duplicates (from a file being
    # in multiple boxes and the 'default' inbox)
    # some formats won't have a message id,
    # but hopefully the date/subject creates a unique
    # key in that case
    yield from unique_everseen(
        emails,
        key=lambda m: (
            m.subject_json,
            m.message_id_json,
            m.dt,
        ),
    )
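The key argument is what makes the de-duplication work across mail files: two Email objects count as the same message when their subject, message id, and date all match, even though the objects themselves are distinct. The same keyed de-duplication on plain dictionaries with made-up fields:

from more_itertools import unique_everseen

msgs = [
    {'subject': 'hi', 'message_id': '<1@x>', 'dt': '2020-01-01', 'box': 'inbox'},
    {'subject': 'hi', 'message_id': '<1@x>', 'dt': '2020-01-01', 'box': 'archive'},
    {'subject': 'hello', 'message_id': '<2@x>', 'dt': '2020-01-02', 'box': 'inbox'},
]

unique_msgs = list(unique_everseen(msgs, key=lambda m: (m['subject'], m['message_id'], m['dt'])))
print(len(unique_msgs))  # 2 -- the archived copy of the first message is dropped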
Code Example #49
def get_int_mapping(dataframe, column):
    """ Returns index, reverse_index, and list of unique items in a pandas datframe """

    # Convert series to list
    column_to_list = dataframe[column].tolist()

    # Find set of unique items and convert to a list
    unique_items_list = list(unique_everseen(column_to_list))

    # Create indexes for each item
    item_index = {item: idx for idx, item in enumerate(unique_items_list)}
    index_item = {idx: item for item, idx in item_index.items()}

    return item_index, index_item, unique_items_list
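A short usage sketch of the same mapping idea on a throwaway DataFrame (the column name and values are invented for illustration):

import pandas as pd
from more_itertools import unique_everseen

df = pd.DataFrame({'color': ['red', 'blue', 'red', 'green', 'blue']})

unique_items = list(unique_everseen(df['color'].tolist()))          # ['red', 'blue', 'green']
item_index = {item: idx for idx, item in enumerate(unique_items)}   # {'red': 0, 'blue': 1, 'green': 2}
index_item = {idx: item for item, idx in item_index.items()}        # {0: 'red', 1: 'blue', 2: 'green'}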
Code Example #50
def get_image_url_list(category, data_type=None):
    """
    For a given data type, returns a list of filenames.

    Args:
        category (string): Category name as a string (from list_image_sets())
        data_type (string, optional): "train" or "val"

    Returns:
        list of strings: list of all filenames for that particular category
    """
    df = _load_data(category, data_type=data_type)
    image_url_list = list(
        unique_everseen(list(img_dir + df['fname'])))
    return image_url_list
Code Example #51
File: ingest.py Project: vm/nba
    def _get_header_add(table):
        """Finds and returns the header of a table.

        :param table: A basketball-reference stats table.
        :returns: Header of the table as a list of strings.
        """
        def replace_titles(title):
            return multiple_replace(
                title,
                {'%': 'P', '3': 'T', '+/-': 'PlusMinus'})
        titles = (
            replace_titles(str(th.get_text()))
            for th in table.find_all('th')
        )
        return list(unique_everseen(titles))
Code Example #52
 def _merged_column_names_from(self, dataset_list):
     elements = []
     for idx_dataset, dataset in enumerate(dataset_list):
         # require getting the code from the dataset object always
         code = self.__dataset_objects__()[idx_dataset].code
         for index, column_name in enumerate(dataset.column_names):
             # only include column names that are not filtered out
             # by specification of the column_indexes list
             if self._include_column(dataset, index):
                 # first index is the date, don't modify the date name
                 if index > 0:
                     elements.append(self._rename_columns(code, column_name))
                 else:
                     elements.append(column_name)
     return list(unique_everseen(elements))
Code Example #53
File: beta_nmf_class.py Project: mikimaus78/groupNMF
 def check_segments_length(self, data, cls_label, ses_label):
     cls = []
     cls_ind = []
     for i in more_itertools.unique_everseen(itertools.izip(cls_label, ses_label)):
         cls.append(i)
         start = np.where((cls_label == i[0]) & (ses_label == i[1]))[0][0]
         stop = np.where((cls_label == i[0]) & (ses_label == i[1]))[0][-1]+1
         cls_ind.append([start, np.min([start+self.buff_size, stop])])
         if data[(cls_label == i[0]) & (ses_label == i[1]), :].shape[0] > self.buff_size:
             ind = np.where((cls_label == i[0]) & (ses_label == i[1]))[0]
             if self.verbose > 0:
                 print "segment {0} to {1} is too long (length={2}, buffer={3})"\
                       "\n please increase buffer size or segment will be truncated"\
                       .format(ind[0], ind[-1], ind[-1]-ind[0]+1, self.buff_size)
     return np.asarray(cls), np.asarray(cls_ind)
Code Example #54
File: markov.py Project: anoadragon453/storytime
def writeStory():
	markovText = ""
	text = ""
	topic = sys.argv[1]
	post_limit = 5
	if sys.argv[2] == "twitter":
		# Get topic to search
		tweets = searchTweets(topic)

		# Parse Tweets
		for tweet in tweets:
			if len(tweet.strip()) > 5:
                            # Remove social crap from tweets (credit: https://stackoverflow.com/a/8377440)
                            stripped = lambda tweet: re.compile('\#').sub('', re.compile('RT @').sub('@', tweet, count=1))
                            text += ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",stripped(tweet)).split()) + '. '
                            text = re.sub("( s )","\'s ",text)
                            text = re.sub("( ll )","\'ll ",text)
                            text = re.sub("( t )","\'t ",text)
                            text = re.sub("( ve )","\'ve ",text)
                            text = re.sub("(https|http)"," ",text)
                            text = re.sub("( m )","\'m ",text)
                            text = re.sub("(y all)","y\'all",text)
                            text = re.sub("( amp )"," & ",text)
                            text = re.sub("( re )","\'r ",text)
                            text = re.sub("( d )","\'d ",text)
                            text = re.sub("(NoneNone)"," ",text)


		# Generate markov
		splitText = generateMarkov(text).rsplit(' ', 1)[0].split(".")
                for tweet in list(unique_everseen(splitText)):
                    markovText+=tweet+'. '

	elif sys.argv[2] == "reddit":
		r = praw.Reddit(user_agent='storytime script: amorgan.me/storytime')	
		r.login('storytime_bot', 'Jnw5v9pgbHW6qXXrNb24EJkGeE8KwZ', disable_warning=True)
		submissions = r.get_subreddit(topic).get_new(limit=10)

		# Get comments
		for x in xrange(0,post_limit):
			submission = next(submissions)
			#comments = praw.helpers.flatten_tree(submission.comments)
			# Parse comments
			print (submission)
			#for comment in submission.comments:
				#markovText+=comment.body
			#	print(comment.body)
	print (markovText)
Code Example #55
File: pbed_gen.py Project: xwa9860/FIG
 def parse1(self, a_pbed, input_file, type):
     if type == 's':
         str_list = []
         str_list.append(
             '%%---Pebble unit cell with position from input file\n' +
             'pbed %d %d "%s"\n' %
             (self.univ.id, a_pbed.coolant.gen.univ.id,
              input_file))
         str_list.append(
             '%%---Coolant in the unit cell\n' +
             a_pbed.coolant.generate_output())
         str_list.append(
             '%%---Pebbles in the unit cell(pbed)\n')
         for pb in list(unique_everseen(a_pbed.pb_list)):
             str_list.append(pb.generate_output())
         return ''.join(str_list)
Code Example #56
	def get_num_concepts(self):
		"""Calculates the concepts in the text
		"""

		# Initialize empty list
		concepts = []
		pairs = self.word_pairs

		for pair in pairs:
			first = pair[0]
			second = pair[1]

			concepts.append(first)
			concepts.append(second)

		num_concepts = len(list(unique_everseen(concepts)))

		return num_concepts
Code Example #57
    def __init__(self):
        #init DB connection
        from Database_Connection import Database_Connection
        database_connection = Database_Connection()

        #get list of words from the database
        self.list_of_words = database_connection.get_entries_as_list()
        words_from_database = database_connection.get_entries_as_list()
        
        #remove quotes because it breaks SQLite
        for i in range(0, len(words_from_database)):
            words_from_database[i] = words_from_database[i].encode('ascii','ignore')
            words_from_database[i] = words_from_database[i].replace("'", "") #todo - fix this shit
            words_from_database[i] += " "

        #add the words that were just downloaded from Facebook
        self.list_of_words.extend(self.load_words_from_json())
        words_from_json = self.load_words_from_json()
        
        #remove quotes because it breaks SQLite
        for i in range(0, len(words_from_json)):
            words_from_json[i] = words_from_json[i].encode('ascii','ignore')
            words_from_json[i] = words_from_json[i].replace("'", "") #todo - fix this shit
            words_from_json[i] += " "

        #remove duplicates
        from more_itertools import unique_everseen
        mega_list = self.list_of_words

        #normalize mega_list for deleting duplicates
        mega_list = [string.strip() for string in mega_list]
        pruned_list = list(unique_everseen(mega_list))

        print "mega list - {0}".format(len(mega_list))
        print "pruned list - {0}".format(len(pruned_list))

        #remove quotes because it breaks SQLite
        for i in range(0, len(pruned_list)):
            pruned_list[i] = pruned_list[i].encode('ascii','ignore')
            pruned_list[i] = pruned_list[i].replace("'", "") #todo - fix this shit
            pruned_list[i] += " "

        database_connection.wipe_database()
        database_connection.add_list_to_database(pruned_list)
Code Example #58
File: Spider.py Project: sebbekarlsson/webster
    def get_urls(self, soup):
        urls = []

        tags = soup.find_all(href=True) + soup.find_all(src=True)

        for tag in tags:
            if tag.get('href') is not None:
                if self.url not in tag.get('href'):
                    urls.append(urljoin(self.url, tag.get('href')))
                else:
                    urls.append(tag.get('href'))

            if tag.get('src') is not None:
                if self.url not in tag.get('src'):
                    urls.append(urljoin(self.url, tag.get('src')))
                else:
                    urls.append(tag.get('src'))

        return unique_everseen(urls)
Code Example #59
def reorder_by_paths(x, paths):
    """
    Reorder the columns of the activity matrix according to a set of
    the most probable paths through a network. Useful for visualizing repeated sequences.
    :param x: activity matrix (rows are timepoints, cols are nodes)
    :param paths: list of tuple paths
    :return: x with optimally permuted columns
    """

    # find all the nodes that appear in probable paths, ordered by those paths
    ordering = list(unique_everseen(chain.from_iterable(paths)))

    # add in the rest of the nodes randomly
    for node in range(x.shape[1]):
        if node not in ordering:
            ordering.append(node)

    # reorder the columns
    return x[:, np.array(ordering)], ordering
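Concatenating the probable paths and de-duplicating them is what produces the column ordering; any node that appears in no path is appended afterwards in plain index order. A toy run of the ordering step with made-up paths:

from itertools import chain
from more_itertools import unique_everseen

paths = [(3, 1, 4), (1, 5, 4)]
ordering = list(unique_everseen(chain.from_iterable(paths)))
print(ordering)  # [3, 1, 4, 5] -- remaining column indices would be appended after this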
Code Example #60
File: harvest.py Project: pieterprovoost/porcellio
def harvestobsids():

	baseurl = "http://waarnemingen.be/soort/view/%s?from=1900-01-01&to=2100-01-01&rows=100&page=%s"
	conn = sqlite3.connect("pissebed.db")
	c = conn.cursor()

	ids = c.execute("select id from species")
	ids = list(itertools.chain.from_iterable(ids))

	for id in ids:

		page = 1
		previous = []

		while True:
			try:
				url = baseurl % (id, page)
				res = br.open(url).read()
				obsids = list(unique_everseen(extractobsids(res)))
				print "Species " + str(id) + ", page " + str(page) + ": found " + str(len(obsids)) + " observations"

				if cmp(previous, obsids) == 0:
					break
				previous = obsids

				for obsid in obsids:
					records = c.execute("select * from observations where id = ?", (obsid,)).fetchall()
					if len(records) == 0:
						print "New observation: %s" % obsid
						c.execute("insert into observations (id, species_id) values (?, ?)", (obsid, id))
						conn.commit()
					else:
						print "Observation %s already exists" % obsid

			except Exception, e:
				print "Error: " + str(e)

			page = page + 1
			if page > maxpages:
				break
			time.sleep(random.randint(mindelay, maxdelay))