Code Example #1
File: exponents.py  Project: joewa/natu
    def update(self, *args, **kwargs):
        """D.update([E, ]**F) -> None.  Update D from dict/iterable E and F.
        If E present, does:  for k in E: D[k] = E[k]
        This is followed by: for k in F: D[k] = F[k]

        **Example:**

        >>> e = Exponents()
        >>> e.update(a=1)
        >>> e.update(dict(b=2), c=3)
        >>> e.update('1/d')
        >>> e # doctest: +SKIP
        Exponents({'a': 1, 'b': 2, 'c': 3, 'd': -1})

        .. testcleanup::
           >>> assert e == dict(a=1, b=2, c=3, d=-1)
        """
        try:
            # Assume args[0] is a string.
            arg = args[0].replace(' ', '')  # Remove spaces.
        except (IndexError, AttributeError):
            Counter.update(self, *args, **kwargs)
        else:
            if len(args) > 1:
                raise TypeError("update expected at most 1 arguments, got %i"
                                % len(args))
            Counter.update(self, Exponents.fromstr(arg), **kwargs)
Code Example #2
File: inspect_features.py  Project: jankim/qb
    def __setitem__(self, item, val):
        if len(self) > self._max * 2:
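            # Once the counter grows past 2 * _max entries, purge everything
            # except the _max most common items.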
            print("Purging")
            to_delete = self.most_common()[self._max:]
            for key, _ in to_delete:  # most_common() yields (key, count) pairs
                del self[key]

        Counter.__setitem__(self, item, val)
Code Example #3
File: dimensions.py  Project: speezepearson/units
 def __init__(self, _dict=None, **kwargs):
     if _dict is None:
         self.__init__(kwargs)
     else:
         if not all(isinstance(x, Rational) for x in _dict.values()):
             raise TypeError("powers of dimensions must be rational")
         Counter.__init__(self, _dict)
         self.clean()
Code Example #4
 def __init__(self, n, *args, **kwargs):
     assert n>=1
     self._size = n
     self._worst = None
     Counter.__init__(self, *args, **kwargs)
     if len(self)>=n:
         self._worst = self.most_common(n)[-1]
         for k,v in self.most_common()[n:]:
             del self[k]
Code Example #5
 def __setitem__(self, k, v):
     if k in self:
         assert self[k]==v
         return
     if self._worst is not None: # the beam is full
         if v<=self._worst[1]:
             return
         else:
             del self[self._worst[0]]
             
     Counter.__setitem__(self, k, v)
     assert self._size>=len(self)
     if self._size==len(self):
         self._worst = min(self.items(), key=lambda kv: kv[1])  # tuple-unpacking lambdas are Python 2 only
Code Example #6
File: main.py  Project: eagle12td/dataMining
def wanabeknn(k=15):
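	# What this appears to do (Python 2 code): for each test row, rank the training
	# rows by the histogram-intersection score dist() below, pool the labels of the
	# k best matches (counting the top k/3 twice), and keep the most common labels.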
	from collections import Counter
	import cPickle  # needed for cPickle.dump() below (this snippet is Python 2 code)
	ftrd = open("minidata/trainingData.csv")
	fted = open("minidata/testData.csv")
	flab = open("minidata/trainingLabels.csv")

	lab = [[int(j) for j in i.strip().split(",")]  for i in flab.readlines()]
	trd = [[int(j) for j in i.strip().split("\t")] for i in ftrd.readlines()]
	ted = [[int(j) for j in i.strip().split("\t")] for i in fted.readlines()]

	def dist(a,b): return sum([min(a[i], b[i]) for i in xrange(len(a))])

	rez = []
	for v in ted:
		print "hurej  %4d   %3d" % ( len(rez),len(rez[-1:]))
		t = []
		for trindex, train in enumerate(trd):
			t.append((dist(train, v), trindex))
		tt = sorted(t, reverse=True)
		ll = []
		for i in range(k): ll += lab[tt[i][1]]
		n = len(ll)
		for i in range(k/3): ll += lab[tt[i][1]]
		rez.append([x[0] for x in Counter.most_common(Counter(ll),n/k)])
		print rez
	cPickle.dump(rez, file("rezPickled/wnbknn%d.pickled" % k, "w"), -1)
Code Example #7
File: data.py  Project: zidarsk8/dataMining
def addFakeData(oData,oLabels,count=100,low=10):
	data = oData[:]
	labels = oLabels[:]
	for iafsa in range(count):
		c = Counter(chain(*labels))
		lc = Counter.most_common(c)
	
		dlc = {}
		for l in lc: dlc[l[0]] = l[1]
	
		#teze = [sum([ dlc[y]**2 for y in x])  for x in labels]
		teze = [sum([ dlc[y] for y in x])  for x in labels]
		teze = sorted([(y,x) for x,y in enumerate(teze)])
		tt = teze[:max(low*10,200)]
		shuffle(tt)
		duplicate = [x[1] for x in tt[:low]]
		dLabels = [labels[i][:] for i in duplicate]
		dData = [data[i][:] for i in duplicate]
		for ii in range(1):
			for i in range(len(duplicate)):
				labels.append(dLabels[i])
				data.append(dData[i])
	# shuffle the rows so they are no longer in a neat order, while making sure
	# the labels stay with their corresponding example
	sd = []
	[sd.append((data[i],labels[i])) for i in xrange(len(data))]
	shuffle(sd)
	ll = []
	dd = []
	for x,y in sd:
		dd.append(x)
		ll.append(y)
	return (dd, ll)
Code Example #8
def shortestCompletingWord(licensePlate, words):
	"""
	:type licensePlate: str
	:type words: List[str]
	:rtype: str
	"""
	d = {}
	licensePlate = licensePlate.lower()
	for c in licensePlate:
		if c.isalpha():
			d[c] = d.get(c, 0) + 1

	res1 = ''
	res2 = ''
	length1 = length2 = 20
	for word in words:
		n = len(word)
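		# Note: f() is not defined in this snippet; it is presumably a helper from the
		# original file that reports whether `word` covers the letter counts in `d`.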
		all_in, flag = f(n, word, d)
		
		if all_in and n < length1:
			res1 = word
			length1 = n
		if flag and n < length2:
			res2 = word
			length2 = n
	if res1:
		return res1
	return res2
Code Example #9
File: NGramSet.py  Project: luguoqing/Diff-FSPM
    def __init__(self, max_len, N_max = 5):
        '''
        @summary: NGramSet Constructor

        @param max_len: optimal sequence length
        @param N_max: maximum n-gram length (as in the CCS2012 approach)

        '''
        Counter.__init__(self)
        
        self.N_max = N_max
        self.max_len = max_len
        
        self.alphabet_size = 0    # total number of distinct items
        self.all_record_num = 0   # total number of records in the dataset
        self.TERM = 0             # sequence-termination symbol
Code Example #10
 def _most_preferred(self, alternatives):
     """Applies funcnamei from each trait to the alternatives and return the most preferred."""
     prefs = [y for y in [getattr(x, funcnamei)(alternatives) for x in self.traits] if y is not None]
     if not prefs:
         return None
     if len(prefs) == 1:
         return prefs[0]
     return Counter.most_common(Counter(prefs), 1)[0][0]
Code Example #11
File: metrics.py  Project: markus-beuckelmann/pattern
 def items(self, relative=False):
     """ Returns a list of (key, value)-tuples sorted by value, highest-first.
         With relative=True, the sum of values is 1.0.
     """
     a = Counter.most_common(self)
     if relative:
         n = sum(v for k, v in a) or 1.
         a = [(k, v / n) for k, v in a]
     return a
Code Example #12
File: multiset.py  Project: jvb/infobiotics-dashboard
 def __len__(self):
     '''Returns the total number of members, excluding repeated elements.
     
     >>> m = multiset('abb')
     >>> len(m)
     2
     
     '''
     return Counter.__len__(self)
Code Example #13
File: visualizer.py  Project: jairideout/qiime2
def most_common_viz(output_dir: str, ints: collections.Counter) -> None:
    df = pd.DataFrame(ints.most_common(), columns=["Integer", "Frequency"])

    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        fh.write('<html><body>\n')
        fh.write('<h3>Most common integers:</h3>\n')
        fh.write(df.to_html(index=False))
        fh.write('</body></html>')

    with open(os.path.join(output_dir, 'index.tsv'), 'w') as fh:
        fh.write(df.to_csv(sep='\t', index=False))
Code Example #14
File: metadict.py  Project: theProphet/Pangaia
    def __add__(self, other):
        """Add together, with recursion.  The basic idea is that the set of keys
        should be added together and then recurse the addition to the values where keys are shared
        otherwise just add the value.

        >>> m, m2 = Mdict('a'), Mdict('abb')
        >>> m + m2 == {'a':2, 'b':2}
        True
        >>> m + 1 == {None:1, 'a':1}
        True
        >>> m['a'] = m2  #now fractal
        >>> m == {'a': {'a':1, 'b':2}}
        True
        >>> m + m + 1 == {'a': {'a':2, 'b':4}, None: 1}
        True
        """
        try: # need to add (union) of keys, and then recurse into values of common keys
            return Mdict(Counter.__add__(self, other)) #Counter.__add__(self, other)
        except TypeError: #must've been an integer for other
            return Mdict(Counter.__add__(self, Counter({None: other})))
Code Example #15
File: data.py  Project: zidarsk8/dataMining
def removeLeastCommonData(oData, oLabels, least=5):
	data = oData[:]
	labels = oLabels[:]
	c = Counter(chain(*labels))
	lc = Counter.most_common(c)
	bb = sorted(list(Set([j for i,j in lc])))
	a = [x[0] for x in lc if x[1] < bb[least]]  # use the `least` parameter instead of the hard-coded 5
	rem = [i for i,j in enumerate(labels) if len(Set(j).intersection(Set(a))) > 0 ]
	[labels.pop(x) for x in sorted(rem, reverse=True)]
	[data.pop(x) for x in sorted(rem, reverse=True)]
	return (data, labels)
Code Example #16
File: data.py  Project: zidarsk8/dataMining
def removeMostCommonData(oData, oLabels, count=20):
	data = oData[:]
	labels = oLabels[:]
	for iafsa in range(count):
		c = Counter(chain(*labels))
		lc = Counter.most_common(c)
		dlc = {}
		for l in lc: dlc[l[0]] = l[1]
		teze = [max([ dlc[y] for y in x])  for x in labels]
		teze = sorted([(y,x) for x,y in enumerate(teze)])
		rem = [x[1] for x in teze[-10:]]
		[labels.pop(x) for x in sorted(rem, reverse=True)]
		[data.pop(x) for x in sorted(rem, reverse=True)]
	return (data, labels)
Code Example #17
 def __init__(self, iterable):
     self.__finishedinit = False
     Counter.__init__(self, iterable)
     self.__finishedinit = True
Code Example #18
 def __len__(self):
     return Counter.__len__(self)
Code Example #19
    def testSearcherSaveRestore(self):
        ray.init(num_cpus=8, local_mode=True)

        def create_searcher():
            class TestSuggestion(Searcher):
                def __init__(self, index):
                    self.index = index
                    self.returned_result = []
                    super().__init__(metric="episode_reward_mean", mode="max")

                def suggest(self, trial_id):
                    self.index += 1
                    return {"test_variable": self.index}

                def on_trial_complete(self, trial_id, result=None, **kwargs):
                    self.returned_result.append(result)

                def save(self, checkpoint_path):
                    with open(checkpoint_path, "wb") as f:
                        pickle.dump(self.__dict__, f)

                def restore(self, checkpoint_path):
                    with open(checkpoint_path, "rb") as f:
                        self.__dict__.update(pickle.load(f))

            searcher = TestSuggestion(0)
            searcher = ConcurrencyLimiter(searcher, max_concurrent=2)
            searcher = Repeater(searcher, repeat=3, set_index=False)
            search_alg = SearchGenerator(searcher)
            experiment_spec = {
                "run": "__fake",
                "num_samples": 20,
                "stop": {
                    "training_iteration": 2
                }
            }
            experiments = [Experiment.from_json("test", experiment_spec)]
            search_alg.add_configurations(experiments)
            return search_alg

        searcher = create_searcher()
        runner = TrialRunner(search_alg=searcher,
                             local_checkpoint_dir=self.tmpdir,
                             checkpoint_period=-1)
        for i in range(6):
            runner.step()

        assert len(
            runner.get_trials()) == 6, [t.config for t in runner.get_trials()]
        runner.checkpoint()
        trials = runner.get_trials()
        [
            runner.trial_executor.stop_trial(t) for t in trials
            if t.status is not Trial.ERROR
        ]
        del runner
        # stop_all(runner.get_trials())

        searcher = create_searcher()
        runner2 = TrialRunner(search_alg=searcher,
                              local_checkpoint_dir=self.tmpdir,
                              resume="LOCAL")
        assert len(runner2.get_trials()) == 6, [
            t.config for t in runner2.get_trials()
        ]

        def trial_statuses():
            return [t.status for t in runner2.get_trials()]

        def num_running_trials():
            return sum(t.status == Trial.RUNNING for t in runner2.get_trials())

        for i in range(6):
            runner2.step()
        assert len(set(trial_statuses())) == 1
        assert Trial.RUNNING in trial_statuses()
        for i in range(20):
            runner2.step()
            assert 1 <= num_running_trials() <= 6
        evaluated = [
            t.evaluated_params["test_variable"] for t in runner2.get_trials()
        ]
        count = Counter(evaluated)
        assert all(v <= 3 for v in count.values())
Code Example #20
File: IO.py  Project: boya-song/OpenNMT-py
def build_vocab(train_dataset_files, fields, data_type, share_vocab,
                src_vocab_path, src_vocab_size, src_words_min_frequency,
                tgt_vocab_path, tgt_vocab_size, tgt_words_min_frequency):
    """
    Args:
        train_dataset_files: a list of train dataset pt file.
        fields (dict): fields to build vocab for.
        data_type: "text", "img" or "audio"?
        share_vocab(bool): share source and target vocabulary?
        src_vocab_path(string): Path to src vocabulary file.
        src_vocab_size(int): size of the source vocabulary.
        src_words_min_frequency(int): the minimum frequency needed to
                include a source word in the vocabulary.
        tgt_vocab_path(string): Path to tgt vocabulary file.
        tgt_vocab_size(int): size of the target vocabulary.
        tgt_words_min_frequency(int): the minimum frequency needed to
                include a target word in the vocabulary.

    Returns:
        Dict of Fields
    """
    counter = {}
    for k in fields:
        counter[k] = Counter()

    # Load vocabulary
    src_vocab = None
    if len(src_vocab_path) > 0:
        src_vocab = set([])
        print('Loading source vocab from %s' % src_vocab_path)
        assert os.path.exists(src_vocab_path), \
            'src vocab %s not found!' % src_vocab_path
        with open(src_vocab_path) as f:
            for line in f:
                word = line.strip().split()[0]
                src_vocab.add(word)

    tgt_vocab = None
    if len(tgt_vocab_path) > 0:
        tgt_vocab = set([])
        print('Loading target vocab from %s' % tgt_vocab_path)
        assert os.path.exists(tgt_vocab_path), \
            'tgt vocab %s not found!' % tgt_vocab_path
        with open(tgt_vocab_path) as f:
            for line in f:
                word = line.strip().split()[0]
                tgt_vocab.add(word)

    for path in train_dataset_files:
        dataset = torch.load(path)
        print(" * reloading %s." % path)
        for ex in dataset.examples:
            for k in fields:
                val = getattr(ex, k, None)
                if val is not None and not fields[k].sequential:
                    val = [val]
                elif k == 'src' and src_vocab:
                    val = [item for item in val if item in src_vocab]
                elif k == 'tgt' and tgt_vocab:
                    val = [item for item in val if item in tgt_vocab]
                counter[k].update(val)

    _build_field_vocab(fields["tgt"], counter["tgt"],
                       max_size=tgt_vocab_size,
                       min_freq=tgt_words_min_frequency)
    print(" * tgt vocab size: %d." % len(fields["tgt"].vocab))

    # All datasets have same num of n_tgt_features,
    # getting the last one is OK.
    for j in range(dataset.n_tgt_feats):
        key = "tgt_feat_" + str(j)
        _build_field_vocab(fields[key], counter[key])
        print(" * %s vocab size: %d." % (key, len(fields[key].vocab)))

    if data_type == 'text':
        _build_field_vocab(fields["src"], counter["src"],
                           max_size=src_vocab_size,
                           min_freq=src_words_min_frequency)
        print(" * src vocab size: %d." % len(fields["src"].vocab))

        # All datasets have same num of n_src_features,
        # getting the last one is OK.
        for j in range(dataset.n_src_feats):
            key = "src_feat_" + str(j)
            _build_field_vocab(fields[key], counter[key])
            print(" * %s vocab size: %d." % (key, len(fields[key].vocab)))

        # Merge the input and output vocabularies.
        if share_vocab:
            # `tgt_vocab_size` is ignored when sharing vocabularies
            print(" * merging src and tgt vocab...")
            merged_vocab = merge_vocabs(
                [fields["src"].vocab, fields["tgt"].vocab],
                vocab_size=src_vocab_size)
            fields["src"].vocab = merged_vocab
            fields["tgt"].vocab = merged_vocab

    return fields
Code Example #21
def process_questions(args):
    ''' Encode question tokens'''
    print('Loading data')
    with open(args.annotation_file, 'r') as dataset_file:
        instances = json.load(dataset_file)

    # Either create the vocab or load it from disk
    if args.mode in ['train']:
        print('Building vocab')
        answer_cnt = {}
        for instance in instances:
            answer = instance['answer']
            answer_cnt[answer] = answer_cnt.get(answer, 0) + 1

        answer_token_to_idx = {'<UNK0>': 0, '<UNK1>': 1}
        answer_counter = Counter(answer_cnt)
        frequent_answers = answer_counter.most_common(args.answer_top)
        total_ans = sum(item[1] for item in answer_counter.items())
        total_freq_ans = sum(item[1] for item in frequent_answers)
        print("Number of unique answers:", len(answer_counter))
        print("Total number of answers:", total_ans)
        print("Top %i answers account for %f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans))

        for token, cnt in Counter(answer_cnt).most_common(args.answer_top):
            answer_token_to_idx[token] = len(answer_token_to_idx)
        print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))

        question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
        for i, instance in enumerate(instances):
            question = instance['question'].lower()[:-1]
            for token in nltk.word_tokenize(question):
                if token not in question_token_to_idx:
                    question_token_to_idx[token] = len(question_token_to_idx)
        print('Get question_token_to_idx')
        print(len(question_token_to_idx))

        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
            'question_answer_token_to_idx': {'<NULL>': 0, '<UNK>': 1}
        }

        print('Write into %s' % args.vocab_json.format(args.dataset, args.dataset))
        with open(args.vocab_json.format(args.dataset, args.dataset), 'w') as f:
            json.dump(vocab, f, indent=4)
    else:
        print('Loading vocab')
        with open(args.vocab_json.format(args.dataset, args.dataset), 'r') as f:
            vocab = json.load(f)

    # Encode all questions
    print('Encoding data')
    questions_encoded = []
    questions_len = []
    question_ids = []
    video_ids_tbw = []
    video_names_tbw = []
    all_answers = []
    for idx, instance in enumerate(instances):
        question = instance['question'].lower()[:-1]
        question_tokens = nltk.word_tokenize(question)
        question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True)
        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))
        question_ids.append(idx)
        im_name = instance['video_id']
        video_ids_tbw.append(im_name)
        video_names_tbw.append(im_name)

        if instance['answer'] in vocab['answer_token_to_idx']:
            answer = vocab['answer_token_to_idx'][instance['answer']]
        elif args.mode in ['train']:
            answer = 0
        elif args.mode in ['val', 'test']:
            answer = 1

        all_answers.append(answer)
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    glove_matrix = None
    if args.mode == 'train':
        token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()}
        print("Load glove from %s" % args.glove_pt)
        glove = pickle.load(open(args.glove_pt, 'rb'))
        dim_word = glove['the'].shape[0]
        glove_matrix = []
        for i in range(len(token_itow)):
            vector = glove.get(token_itow[i], np.zeros((dim_word,)))
            glove_matrix.append(vector)
        glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
        print(glove_matrix.shape)

    print('Writing', args.output_pt.format(args.dataset, args.dataset, args.mode))
    obj = {
        'questions': questions_encoded,
        'questions_len': questions_len,
        'question_id': question_ids,
        'video_ids': np.asarray(video_ids_tbw),
        'video_names': np.array(video_names_tbw),
        'answers': all_answers,
        'glove': glove_matrix,
    }
    with open(args.output_pt.format(args.dataset, args.dataset, args.mode), 'wb') as f:
        pickle.dump(obj, f)
Code Example #22
File: containers.py  Project: rodricios/wikicrawl
 def __init__(self, *args, **kwargs):
     Counter.__init__(self, *args, **kwargs)
Code Example #23
File: EmoCount.py  Project: lolgans/twitchSentiment
# print Counter(foundEmosAndSmilies)
# print("found smilies top50")
# print Counter.most_common(Counter(foundEmosAndSmilies), 50)
# print("total number of sentences with smilies:")
# print numOfTotalSentences

"""
Classify
"""
for message in messages:
    messagecounter += 1
    # for every sentence
    score, words = emoCount.score(message)

    foundEmosAndSmilies = foundEmosAndSmilies + words
    if len(words) != 0:
        numOfTotalSentences += 1
        print(words)


# in the end
print("Messages total:")
print messagecounter
print("found smilies total:")
print sum(Counter(foundEmosAndSmilies).values())
print("found smilies examples ordered:")
print Counter(foundEmosAndSmilies)
print("found smilies top50")
print Counter.most_common(Counter(foundEmosAndSmilies), 50)
print("total number of sentences with smilies:")
print numOfTotalSentences
Code Example #24
File: 286148.py  Project: 382335657/pythonHomework
def func(s1: str, s2: str):
    if s1 == s2:
        return 0
    if s1.startswith(s2):
        return 1
    if s2.startswith(s1):
        return -1
    for i in range(0, min(len(s1), len(s2))):
        if alphabet.index(s1[i]) < alphabet.index(s2[i]):
            return -1
        elif alphabet.index(s1[i]) > alphabet.index(s2[i]):
            return 1


ls = [input().strip() for x in range(0, int(input()))]
temp1 = [list(Counter(x).keys()) for x in ls]
alphabet = []
for x in temp1:
    alphabet = list(set(alphabet) | set(x))
#print(alphabet)
first = []
for x in range(0, len(alphabet)):
    ls = sorted(ls, key=cmp_to_key(func))
    first.append(ls[0])
    alphabet.append(alphabet[0])
    alphabet.remove(alphabet[0])
#print(first)
result = list(set(first))
print(len(result))
for x in ls:
    if x in result:
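        print(x)  # the source snippet is truncated here; printing each matching word is a plausible completion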
Code Example #25
File: lca_utils.py  Project: ricky-lim/sourmash
    def find(self, minhash, threshold, containment=False, ignore_scaled=False):
        """
        Do a Jaccard similarity or containment search.
        """
        # make sure we're looking at the same scaled value as database
        if self.scaled > minhash.scaled:
            minhash = minhash.downsample_scaled(self.scaled)
        elif self.scaled < minhash.scaled and not ignore_scaled:
            raise ValueError(
                "lca db scaled is {} vs query {}; must downsample".format(
                    self.scaled, minhash.scaled))

        if not hasattr(self, 'signatures'):
            debug('creating signatures for LCA DB...')
            sigd = defaultdict(minhash.copy_and_clear)

            for (k, v) in self.hashval_to_idx.items():
                for vv in v:
                    sigd[vv].add_hash(k)

            self.signatures = sigd

        debug('=> {} signatures!', len(self.signatures))

        # build idx_to_ident from ident_to_idx
        if not hasattr(self, 'idx_to_ident'):
            idx_to_ident = {}
            for k, v in self.ident_to_idx.items():
                idx_to_ident[v] = k

            self.idx_to_ident = idx_to_ident

        query_mins = set(minhash.get_mins())

        # collect matching hashes:
        c = Counter()
        for hashval in query_mins:
            idx_list = self.hashval_to_idx.get(hashval, [])
            for idx in idx_list:
                c[idx] += 1

        debug('number of matching signatures for hashes: {}', len(c))

        for idx, count in c.items():
            ident = self.idx_to_ident[idx]
            name = self.ident_to_name[ident]
            debug('looking at {} ({})', ident, name)

            match_mh = self.signatures[idx]
            match_size = len(match_mh)

            debug('count: {}; query_mins: {}; match size: {}', count,
                  len(query_mins), match_size)

            if containment:
                score = count / len(query_mins)
            else:
                score = count / (len(query_mins) + match_size - count)

            debug('score: {} (containment? {})', score, containment)

            if score >= threshold:
                # reconstruct signature... ugh.
                from .. import SourmashSignature
                match_sig = SourmashSignature(match_mh, name=name)

                yield score, match_sig, match_sig.md5sum(), self.filename, name
Code Example #26
def create_data_analysis_report(data_frame):


    real_news, fake_news = [news for _, news in data_frame.groupby(data_frame['is_sarcastic'] == 1)]
    logging.info("---------------------------------Shape of Real and Fake news in training data----------------------------------------------------------------------")
    logging.info(real_news.shape)
    logging.info(fake_news.shape)
    print("\n-------------------------------------------Shape of Real and Fake News in training data------------------------------------------------------------------------")
    print("\n Real News:",real_news.shape)
    print("\n Fake News:",fake_news.shape)



#<-------------------------------------------------EXPLORING ARTICLE HEADLINE TEXT---------------------------------------------------------------------------------------------->
    words_per_headline_plot_t = real_news["headline"].apply(lambda x: len(x.split()))
    stdev_t_head = statistics.stdev(words_per_headline_plot_t)
    words_per_headline_t = words_per_headline_plot_t.sum() / len(real_news["headline"])

    words_per_headline_plot_f = fake_news["headline"].apply(lambda x: len(x.split()))
    stdev_f_head = statistics.stdev(words_per_headline_plot_f)
    words_per_headline_f = words_per_headline_plot_f.sum() / len(fake_news["headline"])

    logging.info("\n--------------------------------------------------------Exploring Article Headline Text-----------------------------------------------------------")
    logging.info("\n-------------------------------------------------Average Number and Standard Deviation----------------------------------------")
    logging.info("\nThe average number of words in a real news Headline is :")
    logging.info(words_per_headline_t)
    logging.info("\nThe average number of words in a fake news Headline is :")
    logging.info(words_per_headline_f)
    logging.info("\nThe standard deviation in real news article lengths is:")
    logging.info(stdev_t_head)
    logging.info("\nThe standard deviation in fake news article lengths is :")
    logging.info(stdev_f_head)
    print("-------------------------------------------Averge number and Standard Deviation------------------------------------------------")
    print("The average number of words in a real news headline is ", words_per_headline_t)
    print("The average number of words in a fake news headline is ", words_per_headline_f)

    print("The standard deviation in real news articles' headline lengths is ", stdev_t_head)
    print("The standard deviation in fake news articles' headline lengths is ", stdev_f_head)


#Plotting the average and standard deviation diagram
    fig, ax = mpl.subplots(1, 2, figsize=(10, 6))
    words_per_headline_plot = sns.distplot(words_per_headline_plot_t, ax=ax[0], color="darkblue", rug=True).set_title(
        "Number of Words in Real News Headline")
    words_per_headline_plot = sns.distplot(words_per_headline_plot_f, ax=ax[1], color="red", rug=True).set_title(
        "Number of Words in Fake News Headline")
    mpl.show()
    words_per_headline_plot.figure.savefig("Data_Analysis_Plots_Directory\words_per_headline_plot.png")

#--------------------------------------------ARTICLE HEADLINE SENTIMENT ANALYSIS-----------------------------------------------------
    headline_polarity_true = pd.DataFrame(columns=["Headline", "sentiment"])
    for headline in real_news["headline"]:
        headline = TextBlob(headline)
        headline_polarity_true = headline_polarity_true.append(
            pd.Series([headline, headline.sentiment.polarity], index=headline_polarity_true.columns), ignore_index=True)


    headline_polarity_fake = pd.DataFrame(columns=["Headline", "sentiment"])
    for headline in fake_news["headline"]:
        headline = TextBlob(headline)
        headline_polarity_fake = headline_polarity_fake.append(
            pd.Series([headline, headline.sentiment.polarity], index=headline_polarity_fake.columns), ignore_index=True)

    headline_polarity_true_sm = statistics.mean(headline_polarity_true["sentiment"])
    headline_polarity_fake_sm = statistics.mean(headline_polarity_fake["sentiment"])


    logging.info(
        "\n-------------------------------------------Sentiment Analysis of Article Headline Text-----------------------------------------------------------")
    logging.info("\nThe headline sentiment analysis result for real_news :")
    logging.info(headline_polarity_true_sm)
    logging.info("\nThe headline sentiment analysis result for fake_news :")
    logging.info(headline_polarity_fake_sm)
    logging.info(
        "\nPlotting headline_sentiment_plot and saved at Data_Analysis_Plots_Directory\headline_sentiment_analysis_plot:")

    fig, ax = mpl.subplots(1, 2, figsize=(10, 6))
    headline_sentiment_plot = sns.distplot(headline_polarity_true["sentiment"], ax=ax[0], color="darkblue",
                                           rug=True).set_title("Real News Headline Sentiments")
    headline_sentiment_plot = sns.distplot(headline_polarity_fake["sentiment"], ax=ax[1], color="red", rug=True).set_title(
        "Fake News Headline Sentiments")
    mpl.show()
    headline_sentiment_plot.figure.savefig("Data_Analysis_Plots_Directory\headline_sentiment_analysis_plot.png")

# #---------------------------------------Computing bigrams in Real News headline-------------------------------------------------------------------
    lemmatizer = WordNetLemmatizer()


    words_in_real_news_headline = []  # all tokens in true articles
    words_in_fake_news_headline = []

    words_in_real_news_headline_with_no_stopwords = []  # all tokens in true articles
    words_in_fake_news_headline_with_no_stopwords = []  # all tokens in fake articles



#--------------------------------------------------Processing ngrams--------------------------------------------------------------------------------

    process(real_news, words_in_real_news_headline)
    process(fake_news, words_in_fake_news_headline)

    bigrams_real_news_headline = zip(words_in_real_news_headline, words_in_real_news_headline[1:])
    bigram_counts_real_news_headline = Counter(bigrams_real_news_headline)
    df = pd.DataFrame(bigram_counts_real_news_headline.most_common(20), columns=["Bigram_Real_News", "Frequency"])
    bigrams_real_news_headline = df
    logging.info(bigrams_real_news_headline)


    df.sort_values(by='Frequency', ascending=False)

    df.plot.barh(x='Bigram_Real_News', y='Frequency', title="Top Bigrams in Real News Headline").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_Bigrams_plot.png", bbox_inches = "tight")
    mpl.show()






#---------------------------------------Computing bigrams in Fake News headline-------------------------------------------------------------------

    bigrams_fake_news_headline = zip(words_in_fake_news_headline, words_in_fake_news_headline[1:])
    bigram_counts_fake_news_headline = Counter(bigrams_fake_news_headline)
    df = pd.DataFrame(bigram_counts_fake_news_headline.most_common(20), columns=["Bigram_Fake_News", "Frequency"])
    bigrams_fake_news_headline = df
    logging.info(bigrams_fake_news_headline)


    df.sort_values(by='Frequency', ascending=False)

    df.plot.barh(x='Bigram_Fake_News', y='Frequency', title="Top Bigram in Fake News Headline").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_Bigrams_plot.png", bbox_inches="tight")
    mpl.show()



#---------------------------------------Computing trigrams in real news headline-------------------------------------------------------------------

    trigrams_real_news_headline = zip(words_in_real_news_headline, words_in_real_news_headline[1:], words_in_real_news_headline[2:])
    trigram_counts_real_news_headline = Counter(trigrams_real_news_headline)
    df = pd.DataFrame(trigram_counts_real_news_headline.most_common(20), columns=["Trigram_Real_News", "Frequency"])
    trigrams_real_news_headline = df
    logging.info(trigrams_real_news_headline)


    df.sort_values(by='Frequency', ascending=False)

    df.plot.barh(x='Trigram_Real_News', y='Frequency', title="Top Trigrams in Real News Headline").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_Trigrams_plot.png", bbox_inches="tight")
    mpl.show()

#---------------------------------------Computing trigrams in fake news headline-------------------------------------------------------------------
    trigrams_fake_news_headline = zip(words_in_fake_news_headline, words_in_fake_news_headline[1:], words_in_fake_news_headline[2:])
    trigram_counts_fake_news_headline = Counter(trigrams_fake_news_headline)
    df = pd.DataFrame(trigram_counts_fake_news_headline.most_common(20), columns=["Trigram_Fake_News", "Frequency"])
    trigrams_fake_news_headline = df
    logging.info(trigrams_fake_news_headline)


    df.sort_values(by='Frequency', ascending=False)

    df.plot.barh(x='Trigram_Fake_News', y='Frequency', title="Top Trigrams in Fake News Headline").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_Trigrams_plot.png", bbox_inches="tight")
    mpl.show()

#---------------------------------------Computing unigram in real news headline-------------------------------------------------------------------

    wordcounts_r = Counter(words_in_real_news_headline)
    mostcommon_r = Counter(wordcounts_r).most_common(20)
    df = pd.DataFrame(mostcommon_r, columns=["Unigram_Real_News", "Frequency"])
    logging.info(df)


    df.sort_values(by='Frequency', ascending=False)

    df.plot.barh(x='Unigram_Real_News', y='Frequency', title="Top Unigram in Real News Headline").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_unigrams_plot.png", bbox_inches="tight")
    mpl.show()

    r_plot = dict(mostcommon_r)
    mostcommon_r = df.reset_index(drop=True)
    mostcommon_r = df['Unigram_Real_News'].tolist()

    r_wc = WordCloud(max_words=25,relative_scaling=1,background_color ='white', normalize_plurals=False).generate_from_frequencies(r_plot)

    mpl.imshow(r_wc)
    mpl.title("Plot of Most Frequent Words in Real News")
    mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_unigrams_wc_plot.png", bbox_inches="tight")
    mpl.show()

#---------------------------------------Computing unigram in fake news headline-------------------------------------------------------------------



    wordcounts_f = Counter(words_in_fake_news_headline)
    mostcommon_f = Counter(wordcounts_f).most_common(20)
    df = pd.DataFrame(mostcommon_f, columns=["Unigram_Fake_News", "Frequency"])
    logging.info(df)


    df.sort_values(by='Frequency', ascending=False)

    df.plot.barh(x='Unigram_Fake_News', y='Frequency', title="Top Unigram in Fake News Headline").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_unigrams_plot.png", bbox_inches="tight")
    mpl.show()

    f_plot = dict(mostcommon_f)
    mostcommon_f = df.reset_index(drop=True)
    mostcommon_f = df['Unigram_Fake_News'].tolist()

    f_wc = WordCloud(max_words=25,relative_scaling=1,background_color ='white', normalize_plurals=False).generate_from_frequencies(f_plot)

    mpl.imshow(f_wc)
    mpl.title("Plot of Most Frequent Words in Fake News")
    mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_unigrams_wc_plot.png", bbox_inches="tight")
    mpl.show()

#Of the top 20 words in each class, 9 words are common
    logging.info("-------------------------Of the top 20 words in each class, Number of words  that are common------------------------------------------")
    logging.info(len(set(mostcommon_r) & set(mostcommon_f)))
#--------------------------------------------------Processing ngrams with no stop words--------------------------------------------------------------------------------

    process_no_stopwords(real_news, words_in_real_news_headline_with_no_stopwords)
    process_no_stopwords(fake_news, words_in_fake_news_headline_with_no_stopwords)

#---------------------------------------Computing bigrams in Real News headline with no stop word-------------------------------------------------------------------

    bigrams_real_news_headline = zip(words_in_real_news_headline_with_no_stopwords, words_in_real_news_headline_with_no_stopwords[1:])
    bigram_counts_real_news_headline = Counter(bigrams_real_news_headline)
    df = pd.DataFrame(bigram_counts_real_news_headline.most_common(20), columns=["Bigram_Real_News_with_no_stopwords", "Frequency"])
    bigrams_real_news_headline = df
    logging.info(bigrams_real_news_headline)


    df.sort_values(by='Frequency', ascending=False)

    df.plot.barh(x='Bigram_Real_News_with_no_stopwords', y='Frequency', title="Top Bigrams in Real News Headline with no stop words").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_Bigrams_with_no_stop_words_plot.png", bbox_inches = "tight")
    mpl.show()






#---------------------------------------Computing bigrams in Fake News headline with no stop word-------------------------------------------------------------------

    bigrams_fake_news_headline = zip(words_in_fake_news_headline_with_no_stopwords, words_in_fake_news_headline_with_no_stopwords[1:])
    bigram_counts_fake_news_headline = Counter(bigrams_fake_news_headline)
    df = pd.DataFrame(bigram_counts_fake_news_headline.most_common(20), columns=["Bigram_Fake_News_no_stopwords", "Frequency"])
    bigrams_fake_news_headline = df
    logging.info(bigrams_fake_news_headline)


    df.sort_values(by='Frequency', ascending=False)

    df.plot.barh(x='Bigram_Fake_News_no_stopwords', y='Frequency', title="Top Bigram in Fake News Headline with no stop words").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_Bigrams_no_stopwords_plot.png", bbox_inches="tight")
    mpl.show()



#---------------------------------------Computing trigrams in real news headline with no stop words-------------------------------------------------------------------

    trigrams_real_news_headline = zip(words_in_real_news_headline_with_no_stopwords, words_in_real_news_headline_with_no_stopwords[1:],words_in_real_news_headline_with_no_stopwords[2:])
    trigram_counts_real_news_headline = Counter(trigrams_real_news_headline)
    df = pd.DataFrame(trigram_counts_real_news_headline.most_common(20), columns=["Trigram_Real_News_no_stopwords", "Frequency"])
    trigrams_real_news_headline = df
    logging.info(trigrams_real_news_headline)
    df.sort_values(by='Frequency', ascending=False)
    df.plot.barh(x='Trigram_Real_News_no_stopwords', y='Frequency', title="Top Trigrams in Real News Headline with no stop words").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_Trigrams_no_stopwords_plot.png", bbox_inches="tight")
    mpl.show()

#---------------------------------------Computing trigrams in fake news headline with no stop words-------------------------------------------------------------------
    trigrams_fake_news_headline = zip(words_in_fake_news_headline_with_no_stopwords, words_in_fake_news_headline_with_no_stopwords[1:], words_in_fake_news_headline_with_no_stopwords[2:])
    trigram_counts_fake_news_headline = Counter(trigrams_fake_news_headline)
    df = pd.DataFrame(trigram_counts_fake_news_headline.most_common(20), columns=["Trigram_Fake_News_no_stopwords", "Frequency"])
    trigrams_fake_news_headline = df
    logging.info(trigrams_fake_news_headline)


    df.sort_values(by='Frequency', ascending=False)

    df.plot.barh(x='Trigram_Fake_News_no_stopwords', y='Frequency', title="Top Trigrams in Fake News Headline with no stop words").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_Trigrams_no_stopwords_plot.png", bbox_inches="tight")
    mpl.show()

#---------------------------------------Computing unigram in real news headline with no stop words-------------------------------------------------------------------

    wordcounts_r = Counter(words_in_real_news_headline_with_no_stopwords)
    mostcommon_r = Counter(wordcounts_r).most_common(20)
    df = pd.DataFrame(mostcommon_r, columns=["Unigram_Real_News_no_stopwords", "Frequency"])
    logging.info(df)
    df.sort_values(by='Frequency', ascending=False)
    df.plot.barh(x='Unigram_Real_News_no_stopwords', y='Frequency', title="Top Unigram in Real News Headline with no stop words").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_unigrams_no_stopwords_plot.png", bbox_inches="tight")
    mpl.show()

    r_plot = dict(mostcommon_r)
    mostcommon_r = df.reset_index(drop=True)
    mostcommon_r = df['Unigram_Real_News_no_stopwords'].tolist()

    r_wc = WordCloud(max_words=25, relative_scaling=1, background_color='white',
                     normalize_plurals=False).generate_from_frequencies(r_plot)

    mpl.imshow(r_wc)
    mpl.title("Plot of Most Frequent Words with no stop words in Real News")
    mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_unigrams_with_o_stopwords_wc_plot.png", bbox_inches="tight")
    mpl.show()

#---------------------------------------Computing unigram in fake news headline with no stop words-------------------------------------------------------------------

    wordcounts_f = Counter(words_in_fake_news_headline_with_no_stopwords)
    mostcommon_f = Counter(wordcounts_f).most_common(20)
    df = pd.DataFrame(mostcommon_f, columns=["Unigram_Fake_News_no_stopwords", "Frequency"])
    logging.info(df)
    df.sort_values(by='Frequency', ascending=False)
    df.plot.barh(x='Unigram_Fake_News_no_stopwords', y='Frequency', title="Top Unigram in Fake News Headline no_stopwords").invert_yaxis()
    mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_unigrams_no_stopwords_plot.png", bbox_inches="tight")
    mpl.show()

    f_plot = dict(mostcommon_f)
    mostcommon_f = df.reset_index(drop=True)
    mostcommon_f = df['Unigram_Fake_News_no_stopwords'].tolist()

    f_wc = WordCloud(max_words=25, relative_scaling=1, background_color='white',
                     normalize_plurals=False).generate_from_frequencies(f_plot)

    mpl.imshow(f_wc)
    mpl.title("Plot of Most Frequent Words wit no stop words in Fake News")
    mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_unigrams_no_stopwords_wc_plot.png", bbox_inches="tight")
    mpl.show()
Code Example #27
File: program.py  Project: asherif844/100DaysOfCode
import re
text = 'Everything is awesome, everything is cool when you are part of the team.  Everything is awesome, when you are living the dream'

# print('Everything is awesome' in text)
# print(text.replace('dream', 'scream'))

text2 = '''
$ python module_index.py |grep ^re
re                 | stdlib | 005, 007, 009, 015, 021, 022, 068, 080, 081, 086, 095
'''

# print(re.findall(r'\d+', text2))

text3 = """Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been 
the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and 
scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into 
electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of
Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus
PageMaker including versions of Lorem Ipsum"""

# print(text3.split())
# print(re.findall(r'[A-Z][a-z0-9]+', text3))

from collections import Counter

cnt = Counter(re.findall(r'[A-Z][a-z0-9]+', text3))

print(cnt)
Code Example #28
 def getMostUsedDefiFormats(self, count=None):
     return Counter([entry.getDefiFormat()
                     for entry in self]).most_common(count)
Code Example #29
def main(args):

    data_file_name = get_file_name(args.data_path, args.data_file)

    tweet_data = dict()
    metrics = {
        'words': list(),
        'unfair': list(),
        'times': list(),
        'source': list()
    }

    with open(data_file_name, 'r') as data_file:
        for line in data_file:
            tweet = json.loads(line)

            status = tweet['status']

            # tweet ID
            tweet_data.setdefault(status['id'], dict())

            # static fields
            target_field, source_field = 'created at', 'created_at'
            if target_field in tweet_data[status['id']]:
                assert tweet_data[
                    status['id']][target_field] == status[source_field]
            else:
                tweet_data[status['id']][target_field] = status[source_field]

            target_field, source_field = 'text', 'text'
            if target_field in tweet_data[status['id']]:
                assert tweet_data[
                    status['id']][target_field] == status[source_field]
            else:
                tweet_data[status['id']][target_field] = status[source_field]

            target_field, source_field = 'source', 'source'
            if target_field in tweet_data[status['id']]:
                assert tweet_data[
                    status['id']][target_field] == status[source_field]
            else:
                tweet_data[status['id']][target_field] = status[source_field]

            # dynamic fields
            target_field, source_field = 'rt', 'retweet_count'
            tweet_data[status['id']].setdefault(target_field, dict())
            tweet_data[status['id']][target_field][
                tweet['collected at']] = status[source_field]

    for id, tweet in tweet_data.iteritems():
        norm_words = normalized(tweet['text'])
        metrics['words'].extend(norm_words)
        metrics['unfair'].append(is_negative(norm_words))
        metrics['times'].append(tweet['created at'])
        metrics['source'].append(tweet['source'])

        if tweet[
                'source'] == '<a href="https://ads.twitter.com" rel="nofollow">Twitter Ads</a>':
            print '>>>>>>>', tweet['created at'], tweet['text']

    n_words = 25
    print 'Top ' + str(
        n_words) + ' most used words (excludes some common words like "the").'
    for word, count in Counter(metrics['words']).most_common(n_words):
        if count > 0:
            print word, '(' + str(count) + ')'

    print

    day_parts = {
        'you should really get some sleep':
        (datetime.time(0, 0, 0), datetime.time(4, 0, 0)),
        'early morning': (datetime.time(4, 0, 1), datetime.time(8, 0, 0)),
        'morning': (datetime.time(8, 0, 1), datetime.time(12, 0, 0)),
        'afternoon': (datetime.time(12, 0, 1), datetime.time(16, 0, 0)),
        'evening': (datetime.time(16, 0, 1), datetime.time(20, 0, 0)),
        'late night': (datetime.time(20, 0, 1), datetime.time(23, 59, 59))
    }

    tweet_day_parts = list()
    for t in metrics['times']:
        for day_part, (start, end) in day_parts.iteritems():
            if start <= datetime.datetime.strptime(
                    t, '%a %b %d %H:%M:%S +0000 %Y').time() <= end:
                tweet_day_parts.append(day_part)
                break

    print 'Tweet frequency by day part'
    for day_part, count in Counter(tweet_day_parts).most_common():
        print day_part, '(' + str(count) + ')'

    print

    print 'Unfair-o-meter: # unfair tweets, # total tweets, % unfair tweets'
    n_unfair = sum(metrics['unfair'])
    n_tweets = len(metrics['unfair'])
    print n_unfair, n_tweets, str(
        100 * round(n_unfair / float(n_tweets), 1)) + '%'

    print

    total = 0
    for source, count in Counter(metrics['source']).most_common():
        total += count
        print source, count
    print 'total tweets:', total
Code Example #30
 def __iter__(self):
     return Counter.__iter__(self)
Code Example #31
@author: sanyuktabaluni
"""


from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter

plt.style.use('seaborn')

df=pd.read_csv("data.csv")

df["LanguagesWorkedWith"]=df["LanguagesWorkedWith"].apply(lambda x: x.split(";"))

print(Counter(df["LanguagesWorkedWith"].iloc[1]))

c=Counter()

for row in df["LanguagesWorkedWith"]:
    c.update(row)
    
print(c)

languages=[]
popularity=[]

for s in c.most_common(15):
    languages.append(s[0])
    popularity.append(s[1])
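# The snippet ends here in the source; a plausible continuation would plot these
# lists, e.g. plt.barh(languages, popularity) followed by plt.show().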
Code Example #32
with open("../input/9.txt") as f:
    data = [list(map(int, line[:-1])) for line in f.readlines()]
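# This looks like Advent of Code 2021, day 9: part 1 sums the risk levels of the
# low points; part 2 flood-fills the basins and multiplies the three largest sizes.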

part_1 = 0
basin = 0
seen = {}
stack = []
for r in range(len(data)):
    for c in range(len(data[0])):
        if all(r + dr < 0 or r + dr >= len(data) or c + dc < 0 or c +
               dc >= len(data[0]) or data[r][c] < data[r + dr][c + dc]
               for dr, dc in ((0, -1), (0, 1), (-1, 0), (1, 0))):
            part_1 += 1 + data[r][c]

        if (r, c) not in seen and data[r][c] != 9:
            stack.append((r, c))
            while stack:
                r, c = stack.pop()
                for dr, dc in ((0, -1), (0, 1), (-1, 0), (1, 0)):
                    r_ = r + dr
                    c_ = c + dc
                    if 0 <= r_ < len(data) and 0 <= c_ < len(data[0]):
                        if (r_, c_) not in seen and data[r_][c_] != 9:
                            seen[(r_, c_)] = basin
                            stack.append((r_, c_))
            basin += 1

print(part_1)
a, b, c = Counter(list(seen.values())).most_common(3)
print(a[1] * b[1] * c[1])
Code Example #33
    print books
    print len(books)


def get_year(book):
    """book["date"] looks like 'November 2014' so we need to

    split on the space and then take the second piece"""

    return int(book["date"].split()[1])

    # 2014 is the last complete year of data (when I ran this)


year_counts = Counter(
    get_year(book) for book in books if get_year(book) <= 2017)

years = sorted(year_counts)

book_counts = [year_counts[year] for year in years]

plt.plot(years, book_counts)

plt.ylabel("# of data books")

plt.title("Data is Big!")

plt.show()

serialized = """{ "title" : "Data Science Book",
Code Example #34
def collect_comments_top(comments, company_id):
    comments_top = Counter()
    for commenters in comments.values():
        comments_top.update(commenters)
    del comments_top[company_id]
    return dict(comments_top.most_common())
Code Example #35
 def _counter(self):  # Non-public method
     return Counter(self.text)
Code Example #36
def simulate(pocket, step, count):
    for _ in range(count):
        pocket = expand(pocket)
        pocket = step(pocket)
    return pocket
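# Note: expand() and step_part1() are defined elsewhere in the original file; this
# looks like Advent of Code 2020, day 17 (Conway Cubes), booted up for six cycles.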

puzzle_input = """
##.#####
#.##..#.
.##...##
###.#...
.#######
##....##
###.###.
.#.#.#..
"""

data = [[elt for elt in line] for line in puzzle_input.strip().split()]

# pocket[z][y][x]
pocket = defaultdict(lambda: defaultdict(dict))
for y, row in enumerate(data):
    for x, value in enumerate(row):
        pocket[0][y][x] = value

print(pocket)

bootup_pocket = simulate(pocket, step_part1, count=6)

print(Counter(bootup_pocket[z][y][x] for z in bootup_pocket for y in bootup_pocket[z] for x in bootup_pocket[z][y]))
Code Example #37
File: descriptive_stats.py  Project: pnnl/HyperNetX
def dist_stats(H):
    """
    Computes many basic hypergraph stats and puts them all into a single dictionary object

        * nrows = number of nodes (rows in the incidence matrix)
        * ncols = number of edges (columns in the incidence matrix)
        * aspect ratio = nrows/ncols
        * ncells = number of filled cells in incidence matrix
        * density = ncells/(nrows*ncols)
        * node degree list = degree_dist(H)
        * node degree dist = centrality_stats(degree_dist(H))
        * node degree hist = Counter(degree_dist(H))
        * max node degree = max(degree_dist(H))
        * edge size list = edge_size_dist(H)
        * edge size dist = centrality_stats(edge_size_dist(H))
        * edge size hist = Counter(edge_size_dist(H))
        * max edge size = max(edge_size_dist(H))
        * comp nodes list = s_comp_dist(H, s=1, edges=False)
        * comp nodes dist = centrality_stats(s_comp_dist(H, s=1, edges=False))
        * comp nodes hist = Counter(s_comp_dist(H, s=1, edges=False))
        * comp edges list = s_comp_dist(H, s=1, edges=True)
        * comp edges dist = centrality_stats(s_comp_dist(H, s=1, edges=True))
        * comp edges hist = Counter(s_comp_dist(H, s=1, edges=True))
        * num comps = len(s_comp_dist(H))

    Parameters
    ----------
    H : Hypergraph

    Returns
    -------
     dist_stats : dict
        Dictionary which keeps track of each of the above items (e.g., basic['nrows'] = the number of nodes in H)
    """
    stats = H.state_dict.get("dist_stats", None)
    if stats is not None:
        return H.state_dict["dist_stats"]
    else:
        cstats = ["min", "max", "mean", "median", "std"]
        basic = dict()

        # Number of rows (nodes), columns (edges), and aspect ratio
        basic["nrows"] = len(H.nodes)
        basic["ncols"] = len(H.edges)
        basic["aspect ratio"] = basic["nrows"] / basic["ncols"]

        # Number of cells and density
        M = H.incidence_matrix(index=False)
        basic["ncells"] = M.nnz
        basic["density"] = basic["ncells"] / (basic["nrows"] * basic["ncols"])

        # Node degree distribution
        basic["node degree list"] = sorted(degree_dist(H), reverse=True)
        basic["node degree centrality stats"] = dict(
            zip(cstats, centrality_stats(basic["node degree list"]))
        )
        basic["node degree hist"] = Counter(basic["node degree list"])
        basic["max node degree"] = max(basic["node degree list"])

        # Edge size distribution
        basic["edge size list"] = sorted(H.edge_size_dist(), reverse=True)
        basic["edge size centrality stats"] = dict(
            zip(cstats, centrality_stats(basic["edge size list"]))
        )
        basic["edge size hist"] = Counter(basic["edge size list"])
        basic["max edge size"] = max(basic["edge size hist"])

        # Component size distribution (nodes)
        basic["comp nodes list"] = sorted(s_comp_dist(H, edges=False), reverse=True)
        basic["comp nodes hist"] = Counter(basic["comp nodes list"])
        basic["comp nodes centrality stats"] = dict(
            zip(cstats, centrality_stats(basic["comp nodes list"]))
        )

        # Component size distribution (edges)
        basic["comp edges list"] = sorted(s_comp_dist(H, edges=True), reverse=True)
        basic["comp edges hist"] = Counter(basic["comp edges list"])
        basic["comp edges centrality stats"] = dict(
            zip(cstats, centrality_stats(basic["comp edges list"]))
        )

        # Number of components
        basic["num comps"] = len(basic["comp nodes list"])

        # # Diameters
        # basic['s edge diam list'] = s_edge_diameter_dist(H)
        # basic['s node diam list'] = s_node_diameter_dist(H)
        if H.isstatic:
            H.set_state(dist_stats=basic)
        return basic
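A minimal usage sketch for dist_stats (not from the original project; it assumes a HyperNetX release matching the code above, and the import path is inferred from the file name):

import hypernetx as hnx
from hypernetx.reports.descriptive_stats import dist_stats  # path assumed from the file name above

# Toy hypergraph: two overlapping edges over four nodes.
H = hnx.Hypergraph({"e1": ["a", "b", "c"], "e2": ["c", "d"]})

stats = dist_stats(H)
print(stats["nrows"], stats["ncols"], stats["density"])
print(stats["node degree hist"])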
Code Example #38
File: FreqAnalysis.py  Project: DanGonite57/CrPyto
def getFrequencies(text):
    """Return a Counter() object of text."""

    return Counter(text)
Code Example #39
def _count_cards(cards: Iterable[Card]) -> Tuple[int, ...]:
    counter = Counter(cards)
    return tuple(counter[card] for card in Card)
Code Example #40
 def frequencySort(self, s):
     cnt = Counter(s)
     ans = []
     for k, v in sorted(cnt.items(), key = lambda item: item[1], reverse = True):
         ans.append(k * v)
     return "".join(ans)
Code Example #41
File: counter.py  Project: geniphi/findig
 def __init__(self, duration, _): # last argument is resource (or None), but it is unused.
     self._hits = []
     self._delta = duration if isinstance(duration, timedelta) \
                            else timedelta(seconds=duration)
     self._thread_lock = Lock()
     self._counter = PyCounter()
コード例 #42
0
ファイル: gp.py プロジェクト: boliqq07/BGP
def mutDifferentReplacementVerbose(individual, pset, personal_map=False):
    """
    choice terminals_and_constants verbose
    Replaces a randomly chosen primitive from *individual* by a randomly
    chosen primitive with the same number of arguments from the :attr:`pset`
    attribute of the individual.
    decrease the probability of same terminals.

    :param individual: The normal or typed tree to be mutated.
    :param pset: SymbolSet
    :param personal_map: bool

    :returns: A tuple of one tree.
    """

    if len(individual) < 4:
        return individual,

    individual = copy.copy(individual)
    ters = [repr(i) for i in individual.terminals()]
    pset_ters = [repr(i) for i in pset.terminals_and_constants]
    cou = Counter(ters)
    cou_mutil = {i: j for i, j in cou.items() if j >= 2}
    ks = list(cou_mutil.keys())
    nks = list(set(pset_ters) - (set(ks)))
    if len(nks) <= 1:
        return individual,

    nks.sort()  # very important: a fixed ordering keeps the random selection reproducible

    p_nks = np.array([pset.prob_ter_con[i] for i in nks])
    p_nks = p_nks.astype(float)
    p_nks /= np.sum(p_nks)

    if cou_mutil:
        indexs = []
        for k, v in cou_mutil.items():
            indi = []
            for i in np.arange(1, len(individual), 2):
                if repr(individual[i]) == k:
                    indi.append(i)
            if indi:
                indexs.append(random.choice(indi))

        if personal_map:
            p_nks_new = pset.premap.get_nodes_value(ind=individual, pset=pset, node=None, site=indexs)
            if p_nks_new is not None:
                nks = list(pset.prob_ter_con.keys())
                p_nks = p_nks_new

        if len(indexs) <= len(nks):
            term = random.choice(nks, len(indexs), replace=False, p=p_nks)
        else:
            term = random.choice(nks, len(indexs), replace=True, p=p_nks)

        term_ters = []
        for name in term:
            for i in pset.terminals_and_constants:
                if repr(i) == name:
                    term_ters.append(i)

        for o, n in zip(indexs, term_ters):
            individual[o] = n

    return individual,
コード例 #43
0
 def __delitem__(self, *args, **kwargs):
     if self.__finishedinit:
         raise AttributeError("Can't change a frozen counter!")
     Counter.__delitem__(self, *args, **kwargs)
コード例 #44
0
ファイル: counter.py プロジェクト: aruntakkar/PyCode
from collections import Counter

text = "In February 2014, I made a recommendation to my co - founders at" \
    "Ballistiq that I wanted to cancel development of ArtStation." \
    "The project was in development hell. It wasn’t going anywhere." \
    "I was unhappy with it and just couldn’t see a path for it to be a"\
    "successful product. Two months later we managed to launch it," \
    "and two years later it is the leading network for professional games."

words = text.split()

Counter = Counter(words)

top_three = Counter.most_common(3)

print(top_three)
コード例 #45
0
 def __getitem__(self, key):
     return Counter.__getitem__(self, key)
コード例 #46
0
 def __init__(self, *args, **kwargs):
     AttrDict.__init__(self, *args, **kwargs)
     Counter.__init__(self)
     self.__exclude_keys__ |= {'most_common'}
コード例 #47
0
from sys import stdin as Si, maxsize as m
from math import floor as F
from collections import defaultdict as dt, Counter as Co
from operator import itemgetter as ig
from math import pi

if __name__ == "__main__":
    L = tuple(map(int, Si.readline().split()))
    H, Max = Co(L), 0
    for k, v in H.items():
        if v > 1:
            Max = max(Max, k * min(v, 3))
    print(sum(L) - Max)

"""
A. Bear and Five Cards
time limit per test
2 seconds
memory limit per test
256 megabytes
input
standard input
output
standard output

A little bear Limak plays a game. He has five cards. There is one number written on each card. Each number is a positive integer.

Limak can discard (throw out) some cards. His goal is to minimize the sum of numbers written on remaining (not discarded) cards.

He is allowed to at most once discard two or three cards with the same number. Of course, he won't discard cards if it's impossible to choose two or three cards with the same number.
"""
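A quick hand trace of the code above with the hand 7 3 7 3 20 (the loop over H.items() is rewritten here as a generator expression for brevity): the Counter is {7: 2, 3: 2, 20: 1}, the best single discard is the pair of 7s (7*2 = 14 beats 3*2 = 6), so the printed answer is 40 - 14 = 26.

>>> from collections import Counter
>>> L = (7, 3, 7, 3, 20)
>>> H = Counter(L)
>>> Max = max((k * min(v, 3) for k, v in H.items() if v > 1), default=0)
>>> sum(L) - Max
26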
コード例 #48
0
 def __init__(self,Config):
     self.preprocessor = Preprocessor()
     self.cfg = Config()
     self.word_counter = Counter()
     self.words_dict = {}
コード例 #49
0
 def __init__(self, word_list=None):
     # Avoid the mutable-default-argument pitfall.
     self.word_list = word_list if word_list is not None else []
     Counter.__init__(self, self.word_list)
コード例 #50
0
    return Article(web_link=web_link, blog_link=blog_link, tags=tags,
                   time_added=time_added, year=year, month=month, date=date,
                   hour=hour, title=title)


lists = map(parse_article, article_list)
article_date_list = []  # 2018-03
article_time_list = []  # 12:24
article_blog_link = []
for l in lists:
    time_string = l.time_added
    article_date_list.append(str(time_string)[0:7])
    article_time_list.append(int(str(time_string)[11:13]))
    article_blog_link.append(l.blog_link)

article_date_list = Counter(article_date_list)
article_time_list = Counter(article_time_list)
article_blog_link_list = Counter(article_blog_link)

article_blog_link_other_list = []
count = 0
article_blog_link_final_list = dict()
for k in article_blog_link_list.keys():
    v = article_blog_link_list.get(k)
    if v <= 10:
        count += v
    else:
        article_blog_link_final_list[k] = v


def dict2list(dic: dict):
コード例 #51
0
 def most_common(self, n, conts):
     """Return the n most frequent words in conts."""
     return Counter.most_common(conts, n)
コード例 #52
0
def main():
    """
    Helper script to encode raw text with the GPT-2 BPE using multiple processes.

    The encoder.json and vocab.bpe files can be obtained here:
    - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
    - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--encoder-json",
        help="path to encoder.json",
    )
    parser.add_argument(
        "--vocab-bpe",
        type=str,
        help="path to vocab.bpe",
    )
    parser.add_argument(
        "--inputs",
        nargs="+",
        default=["-"],
        help="input files to filter/encode",
    )
    parser.add_argument(
        "--outputs",
        nargs="+",
        default=["-"],
        help="path to save encoded outputs",
    )
    parser.add_argument(
        "--keep-empty",
        action="store_true",
        help="keep empty lines",
    )
    parser.add_argument("--workers", type=int, default=20)
    args = parser.parse_args()

    assert len(args.inputs) == len(
        args.outputs
    ), "number of input and output paths should match"

    with contextlib.ExitStack() as stack:
        inputs = [
            stack.enter_context(open(input, "r", encoding="utf-8"))
            if input != "-"
            else sys.stdin
            for input in args.inputs
        ]
        outputs = [
            stack.enter_context(open(output, "w", encoding="utf-8"))
            if output != "-"
            else sys.stdout
            for output in args.outputs
        ]

        encoder = MultiprocessingEncoder(args)
        pool = Pool(args.workers, initializer=encoder.initializer)
        encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100)

        stats = Counter()
        for i, (filt, enc_lines) in enumerate(encoded_lines, start=1):
            if filt == "PASS":
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(enc_line, file=output_h)
            else:
                stats["num_filtered_" + filt] += 1
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        for k, v in stats.most_common():
            print("[{}] filtered {} lines".format(k, v), file=sys.stderr)
コード例 #53
0
def filter_picks(catalog, stations=None, channels=None, networks=None,
                 locations=None, top_n_picks=None, evaluation_mode='all'):
    """
    Filter events in the catalog based on a number of parameters.

    :param catalog: Catalog to filter.
    :type catalog: obspy.core.event.Catalog
    :param stations: List for stations to keep picks from.
    :type stations: list
    :param channels: List of channels to keep picks from.
    :type channels: list
    :param networks: List of networks to keep picks from.
    :type networks: list
    :param locations: List of location codes to use
    :type locations: list
    :param top_n_picks: Filter only the top N most used station-channel pairs.
    :type top_n_picks: int
    :param evaluation_mode:
        To select only manual or automatic picks, or use all (default).
    :type evaluation_mode: str


    :return:
        Filtered Catalog - if events are left with no picks, they are removed
        from the catalog.
    :rtype: obspy.core.event.Catalog

    .. note::
        Will filter first by station, then by channel, then by network; if
        using top_n_picks, that filter is applied last, after the other
        filters have been applied.

    .. note::
        Doesn't work in place on the catalog, your input catalog will be safe
        unless you overwrite it.

    .. note:: Doesn't expand wildcard characters.

    .. rubric:: Example

    >>> from obspy.clients.fdsn import Client
    >>> from eqcorrscan.utils.catalog_utils import filter_picks
    >>> from obspy import UTCDateTime
    >>> client = Client('NCEDC')
    >>> t1 = UTCDateTime(2004, 9, 28)
    >>> t2 = t1 + 86400
    >>> catalog = client.get_events(starttime=t1, endtime=t2, minmagnitude=3,
    ...                             minlatitude=35.7, maxlatitude=36.1,
    ...                             minlongitude=-120.6, maxlongitude=-120.2,
    ...                             includearrivals=True)
    >>> print(len(catalog))
    12
    >>> filtered_catalog = filter_picks(catalog, stations=['BMS', 'BAP',
    ...                                                    'PAG', 'PAN',
    ...                                                    'PBI', 'PKY',
    ...                                                    'YEG', 'WOF'])
    >>> print(len(filtered_catalog))
    12
    >>> stations = []
    >>> for event in filtered_catalog:
    ...     for pick in event.picks:
    ...         stations.append(pick.waveform_id.station_code)
    >>> print(sorted(list(set(stations))))
    ['BAP', 'BMS', 'PAG', 'PAN', 'PBI', 'PKY', 'WOF', 'YEG']
    """
    # Don't work in place on the catalog
    filtered_catalog = catalog.copy()

    if stations:
        for event in filtered_catalog:
            if len(event.picks) == 0:
                continue
            event.picks = [pick for pick in event.picks
                           if pick.waveform_id.station_code in stations]
    if channels:
        for event in filtered_catalog:
            if len(event.picks) == 0:
                continue
            event.picks = [pick for pick in event.picks
                           if pick.waveform_id.channel_code in channels]
    if networks:
        for event in filtered_catalog:
            if len(event.picks) == 0:
                continue
            event.picks = [pick for pick in event.picks
                           if pick.waveform_id.network_code in networks]
    if locations:
        for event in filtered_catalog:
            if len(event.picks) == 0:
                continue
            event.picks = [pick for pick in event.picks
                           if pick.waveform_id.location_code in locations]
    if evaluation_mode == 'manual':
        for event in filtered_catalog:
            event.picks = [pick for pick in event.picks
                           if pick.evaluation_mode == 'manual']
    elif evaluation_mode == 'automatic':
        for event in filtered_catalog:
            event.picks = [pick for pick in event.picks
                           if pick.evaluation_mode == 'automatic']
    elif evaluation_mode != 'all':
        warnings.warn('Unrecognised evaluation_mode: %s, using all picks' %
                      evaluation_mode)
    if top_n_picks:
        all_picks = []
        for event in filtered_catalog:
            all_picks += [(pick.waveform_id.station_code,
                           pick.waveform_id.channel_code)
                          for pick in event.picks]
        counted = Counter(all_picks).most_common()
        all_picks = []
        # Hack around sorting the counter object: Py 2 does it differently to 3
        for i in range(counted[0][1]):
            highest = [item[0] for item in counted
                       if item[1] >= counted[0][1] - i]
            # Sort them by alphabetical order in station
            highest = sorted(highest, key=lambda tup: tup[0])
            for stachan in highest:
                if stachan not in all_picks:
                    all_picks.append(stachan)
            if len(all_picks) > top_n_picks:
                all_picks = all_picks[0:top_n_picks]
                break
        for event in filtered_catalog:
            if len(event.picks) == 0:
                continue
            event.picks = [pick for pick in event.picks
                           if (pick.waveform_id.station_code,
                               pick.waveform_id.channel_code) in all_picks]
    # Remove events without picks
    tmp_catalog = Catalog()
    for event in filtered_catalog:
        if len(event.picks) > 0:
            tmp_catalog.append(event)

    return tmp_catalog
コード例 #54
0
df2[1]
df2[np.where(df2=='顶')]
df2[[1,2,3]]
df2.ix[[1,2,3]]

[1,2,3,4].__dir__()
'的' in ['的大丰','的']

test = [1,2,3,4,2,2,3,1,4,4,4]

################ usage of the "key" argument of max()
print(max(set(test),key=test.count))

a = [1,4,2,3,2,3,4,2]
from collections import Counter
Counter(a).most_common(2)

import functools
product = functools.reduce((lambda x, y: x * y),  [1, 2, 3, 4])
import operator
operator.xor(60,13)

functools.reduce(operator.xor, [1,2,5,2,1,5,9,2])

t1 = [1,2,3]
t2 =[10,20,30]
dict(zip(t1,t2))[3]

la = [1,2]
lb = [4,5,6]
lc = [7,8,9,10]
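For reference, the Counter- and reduce-related expressions above evaluate as follows (easy to check in a REPL):

# max(set(test), key=test.count)                            -> 4
# Counter(a).most_common(2)                                  -> [(2, 3), (4, 2)]
# functools.reduce((lambda x, y: x * y), [1, 2, 3, 4])       -> 24
# operator.xor(60, 13)                                       -> 49
# functools.reduce(operator.xor, [1, 2, 5, 2, 1, 5, 9, 2])   -> 11
# dict(zip(t1, t2))[3]                                       -> 30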
コード例 #55
0
ファイル: prepare_data.py プロジェクト: unixunion/nmt_project
def prepare():
    global vocab, written_lines

    # Files to be prepared
    files = {
        '{}.{}'.format(hparams['train_prefix'].replace('.bpe', ''), hparams['src']).replace(
            preprocessing['train_folder'], '').lstrip('\\/'): {'amount': 1, 'up_to': -1},
        # copy all of data (up to "samples")
        '{}.{}'.format(hparams['dev_prefix'].replace('.bpe', ''), hparams['src']).replace(preprocessing['train_folder'],
                                                                                          '').lstrip('\\/'): {
            'amount': .1, 'up_to': preprocessing['test_size']},  # copy 1/10th but up to 'test_size'
        '{}.{}'.format(hparams['test_prefix'].replace('.bpe', ''), hparams['src']).replace(
            preprocessing['train_folder'], '').lstrip('\\/'): {'amount': .1, 'up_to': preprocessing['test_size']},
        '{}.{}'.format(hparams['train_prefix'].replace('.bpe', ''), hparams['tgt']).replace(
            preprocessing['train_folder'], '').lstrip('\\/'): {'amount': 1, 'up_to': -1},
        '{}.{}'.format(hparams['dev_prefix'].replace('.bpe', ''), hparams['tgt']).replace(preprocessing['train_folder'],
                                                                                          '').lstrip('\\/'): {
            'amount': .1, 'up_to': preprocessing['test_size']},
        '{}.{}'.format(hparams['test_prefix'].replace('.bpe', ''), hparams['tgt']).replace(
            preprocessing['train_folder'], '').lstrip('\\/'): {'amount': .1, 'up_to': preprocessing['test_size']},
    }

    # pprint.pformat(files, indent=4)

    print(colorama.Fore.GREEN + "\nPreparing training set from raw set" + colorama.Fore.RESET)

    # Ensure that train folder exists
    try:
        os.makedirs(preprocessing['train_folder'])
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Ensure that model/log folder exists
    train_log_dir = os.path.join(hparams['out_dir'], 'train_log')
    try:
        os.makedirs(train_log_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    data_vocab = Counter()

    # Iterate through files and prepare them
    for file_name, amounts in files.items():

        vocab = Counter()

        print("File: {}{}{}".format(colorama.Fore.GREEN, file_name, colorama.Fore.RESET))

        # Output file handler
        out_file = open('{}/{}'.format(preprocessing['train_folder'], file_name), 'w', encoding='utf-8',
                        buffering=131072)

        # Maximum number of lines
        read = 0
        amount = int(min(amounts['amount'] * preprocessing['samples'] if preprocessing['samples'] > 0 else 10 ** 20,
                         amounts['up_to'] if amounts['up_to'] > 0 else 10 ** 20))

        # Prepare thread variables
        write_thread = None
        vocab_thread = None
        written_lines = 0

        # We are going to use multiprocessing for tokenization, as it's cpu intensive
        with Pool(processes=preprocessing['cpu_count']) as pool:

            # Count number of lines in file
            progress = tqdm(ascii=True, unit=' lines', total=min(amount, sum(1 for _ in open(
                '{}/{}'.format(preprocessing['source_folder'], file_name), 'r', encoding='utf-8', buffering=131072))))

            # Open input file
            with open('{}/{}'.format(preprocessing['source_folder'], file_name), 'r', encoding='utf-8',
                      buffering=131072) as in_file:

                last_batch = False

                # Iterate every 10k lines
                for rows in read_lines(in_file, 10000, ''):

                    # If number of lines is greater than limit - break
                    read += len(rows)
                    if read >= amount:
                        rows = rows[:amount - read + len(rows)]
                        last_batch = True

                    # Process using multiprocessing
                    rows = pool.map(tokenize, rows, 100)

                    # Process vocab using multiprocessing
                    vocab_part = pool.map(sentence_split, rows, 100)

                    # Join running threads from previous loop
                    if write_thread is not None:
                        write_thread.join()
                        vocab_thread.join()
                        progress.update(written_lines)

                    # Thread for vocab update
                    vocab_thread = Thread(target=append_vocab, args=(vocab_part,))
                    vocab_thread.start()

                    # And thread for saving tokenized data to output file
                    write_thread = Thread(target=write_lines, args=(out_file, rows, written_lines == 0))
                    write_thread.start()

                    # Last batch - break / exit loop
                    if last_batch:
                        break

                # Join running threads and update progress bar
                write_thread.join()
                vocab_thread.join()
                progress.update(written_lines)
                progress.close()

        # If it's train file, save vocab
        if file_name == '{}.{}'.format(hparams['train_prefix'].replace('.bpe', ''), hparams['src']).replace(
                preprocessing['train_folder'], '').lstrip('\\/'):
            data_vocab[hparams['src']] = vocab
        elif file_name == '{}.{}'.format(hparams['train_prefix'].replace('.bpe', ''), hparams['tgt']).replace(
                preprocessing['train_folder'], '').lstrip('\\/'):
            data_vocab[hparams['tgt']] = vocab

    # If joined vocab - add counters
    if preprocessing['joined_vocab']:
        data_vocab[hparams['src']] += data_vocab[hparams['tgt']]
        del data_vocab[hparams['tgt']]

    # BPE/WPM-like tokenization
    # inspired by and based on https://github.com/rsennrich/subword-nmt
    if preprocessing['use_bpe']:

        print(colorama.Fore.GREEN + "\nLearning BPE" + colorama.Fore.RESET)

        # List of subword joins to be applied to training data
        joins = {}

        # Final train vocab for NMT
        train_vocab = {}

        # Learn BPE for both vocabs (or common vocab)
        for source, raw_vocab in data_vocab.items():

            # Pair stats
            stats = Counter()

            # Pair indexes
            indices = defaultdict(lambda: defaultdict(int))

            # Build 'new' vocab used for BPE learning (train_vocab will be a final vocab for NMT)
            vocab = []
            train_vocab[source] = Counter()

            # Build vocab for BPE learning purpose
            print("Building temporary vocab ({})".format(hparams['src'] if preprocessing['joined_vocab'] else source))
            for i, (entity, freq) in tqdm(enumerate(raw_vocab.most_common()), ascii=True, unit=' tokens'):

                # Split vocab token
                entity = tuple(entity.split())

                # Make pairs ("ABCD" -> (A, B), (B, C), (C, D)), stats, indexes and train vocab
                prev_char = entity[0]
                train_vocab[source][prev_char] += freq
                for char in entity[1:]:
                    stats[prev_char, char] += freq
                    indices[prev_char, char][i] += 1
                    train_vocab[source][char] += freq
                    prev_char = char
                vocab.append((entity, freq))

            print("Learning BPE for vocab of {} tokens".format(preprocessing['vocab_size']))

            # List of joins per vocab
            joins[source] = []

            # Partial stats speeds up learning process - optimization for 'max' above
            partial_stats = Counter(['', -1])
            partial_stats_min = 0
            update_partial_stats = True

            # Current number of vocab tokens
            train_vocab_len = prev_train_vocab_len = len(train_vocab[source])

            # Progress bar
            progress = tqdm(ascii=True, unit=' tokens', total=preprocessing['vocab_size'], maxinterval=0.1, miniters=10)
            progress.monitor_interval = 1
            progress.update(prev_train_vocab_len)

            # Learn until vocab will contain desired number of tokens
            while train_vocab_len < preprocessing['vocab_size']:

                clean_train_vocab = False

                # Get most frequent pair
                most_frequent, freq = partial_stats.most_common(1)[0]

                # Rebuild partial stats when flagged, or when the most frequent pair's frequency drops to the saved partial-stats minimum
                if update_partial_stats or freq <= partial_stats_min:
                    partial_stats_min = partial_stats.most_common(500)[-1][1]
                    partial_stats = Counter()
                    for k, v in stats.most_common():
                        if v < partial_stats_min:
                            break
                        partial_stats[k] = v
                    update_partial_stats = False

                    # Get most frequent pair (again, proper one this time)
                    most_frequent, _ = partial_stats.most_common(1)[0]

                # If frequency is lower than 2 - exit
                if stats[most_frequent] < 2:
                    print(
                        'No pair has frequency greater than 1. Stopping early; your vocab file will include fewer tokens.\n')
                    break

                # Replace pair "A B" with new entity "AB"

                # Changes made
                changes = []

                # Replace regex
                pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(most_frequent)) + r'(?!\S)')

                # Loop through indices
                for j, freq in indices[most_frequent].items():

                    # Do not touch not existent pairs
                    if freq < 1:
                        continue

                    # Get entity and frequency
                    entity, freq = vocab[j]

                    # Replace "A B" with "AB" in entity
                    new_entity = pattern.sub(''.join(most_frequent), ' '.join(entity))
                    new_entity = tuple(new_entity.split())

                    # Update entity
                    vocab[j] = (new_entity, freq)

                    changes.append((j, new_entity, entity, freq))

                # Update indices and pair stats
                # Merged pair doesn't exist anymore
                stats[most_frequent] = 0
                partial_stats[most_frequent] = 0
                indices[most_frequent] = defaultdict(int)

                # Get entities and a new pair
                first, second = most_frequent
                new_pair = first + second

                # Iterate through all changes
                for j, entity, old_entity, freq in changes:

                    # Find all occurrences of the first pair entity
                    prev = -2
                    for i in iter([i for i, entity in enumerate(old_entity) if entity == first]):

                        # Do not touch second "B B" if "B B B"
                        if i == prev + 1:
                            continue

                        # Check if second pair entity follows first one
                        if i < len(old_entity) - 1 and old_entity[i + 1] == second:

                            # Reduce frequency of "A B" in "A B C D" where "B C" is a merged pair
                            if i:
                                prev = old_entity[i - 1:i + 1]
                                stats[prev] -= freq
                                partial_stats[prev] = stats[prev]
                                indices[prev][j] -= 1

                            # Reduce frequency of "C D" in "A B C D" where "B C" is a merged pair
                            if i < len(old_entity) - 2:

                                # But do not touch "C B" if "A B C B C" as values will be adjusted with the next occurrence of the "B C" pair
                                if old_entity[i + 2] != first or i >= len(old_entity) - 3 or old_entity[
                                    i + 3] != second:
                                    next = old_entity[i + 1:i + 3]
                                    stats[next] -= freq
                                    partial_stats[next] = stats[next]
                                    indices[next][j] -= 1

                            prev = i

                            if train_vocab[source][first] <= freq or train_vocab[source][second] <= freq:
                                clean_train_vocab = True
                            train_vocab[source][first] -= freq
                            train_vocab[source][second] -= freq

                    # Find all occurrences of the new (merged) pair entity
                    for i in [i for i, entity in enumerate(entity) if entity == new_pair]:

                        # Increase frequency of (new pair) "A BC" in "A BC D"
                        if i:
                            prev = entity[i - 1:i + 1]
                            stats[prev] += freq
                            if stats[prev] > partial_stats_min:
                                update_partial_stats = True
                            indices[prev][j] += 1

                        # Increase frequency of (new pair) "BC D" in "A BC D", but do not touch if "A BC BC" as stats for "BC BC" will be adjusted win next occurence of "BC" pair
                        if i < len(entity) - 1 and entity[i + 1] != new_pair:
                            next = entity[i:i + 2]
                            stats[next] += freq
                            if stats[next] > partial_stats_min:
                                update_partial_stats = True
                            indices[next][j] += 1

                        # Set frequency of a new pair
                        train_vocab[source][new_pair] += freq

                # Current pair is merged - is not a pair anymore, so has frequency of 0
                stats[most_frequent] = 0
                partial_stats[most_frequent] = 0

                # Remove (from training vocab) tokens with frequency of 0
                if clean_train_vocab:
                    train_vocab[source] = +train_vocab[source]

                # Calculate current number of train vocab entities
                prev_train_vocab_len = train_vocab_len
                train_vocab_len = len(train_vocab[source])
                train_vocab_len_diff = train_vocab_len - prev_train_vocab_len

                # Update progress bar
                if train_vocab_len_diff >= 0:
                    progress.update(train_vocab_len_diff)

                # For a negative number set new value directly - tqdm doesn't support negative updates
                else:
                    progress.n += train_vocab_len_diff
                    progress.refresh()

                # Add new join pair
                joins[source].append(most_frequent)

            # Save list of joins for train vocab
            joins[source] = dict(reversed([(v, i) for i, v in enumerate(joins[source])]))

            # Done
            progress.close()

        # Save list of joins to a file (joined vocab) and replace main vocabs
        if preprocessing['joined_vocab']:
            with open('{}/{}'.format(preprocessing['train_folder'], 'bpe_joins.common.json'), 'w', encoding='utf-8',
                      buffering=131072) as bpe_file:
                json.dump({json.dumps(k): v for k, v in joins[hparams['src']].items()}, bpe_file)
            data_vocab[hparams['src']] = train_vocab[hparams['src']]

        # Save list of joins to files (separated vocab)
        else:
            with open('{}/{}'.format(preprocessing['train_folder'], 'bpe_joins.{}.json'.format(hparams['src'])), 'w',
                      encoding='utf-8', buffering=131072) as bpe_file:
                json.dump({json.dumps(k): v for k, v in joins[hparams['src']].items()}, bpe_file)
            with open('{}/{}'.format(preprocessing['train_folder'], 'bpe_joins.{}.json'.format(hparams['tgt'])), 'w',
                      encoding='utf-8', buffering=131072) as bpe_file:
                json.dump({json.dumps(k): v for k, v in joins[hparams['tgt']].items()}, bpe_file)
            data_vocab[hparams['src']] = train_vocab[hparams['src']]
            data_vocab[hparams['tgt']] = train_vocab[hparams['tgt']]

        print(colorama.Fore.GREEN + "\nApplying BPE" + colorama.Fore.RESET)

        # BPE files to be prepared
        bpe_files = [
            '{}.{}'.format(hparams['train_prefix'], hparams['src']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
            '{}.{}'.format(hparams['dev_prefix'], hparams['src']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
            '{}.{}'.format(hparams['test_prefix'], hparams['src']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
            '{}.{}'.format(hparams['train_prefix'], hparams['tgt']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
            '{}.{}'.format(hparams['dev_prefix'], hparams['tgt']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
            '{}.{}'.format(hparams['test_prefix'], hparams['tgt']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
        ]

        # Iterate through files and apply BPE
        for i, file_name in enumerate(bpe_files):

            # Current train vocab
            source = hparams['src'] if preprocessing['joined_vocab'] else file_name.split('.')[-1]

            print("File: {}{}{}".format(colorama.Fore.GREEN, file_name, colorama.Fore.RESET))

            # Output file handler
            out_file = open('{}/{}'.format(preprocessing['train_folder'], file_name), 'w', encoding='utf-8',
                            buffering=131072)

            # Prepare thread variables
            write_thread = None
            written_lines = 0

            # We are going to use multiprocessing for joins, as it's cpu intensive
            with Pool(processes=preprocessing['cpu_count'], initializer=apply_bpe_init,
                      initargs=(joins[source],)) as pool:

                # Progress bar
                progress = tqdm(ascii=True, unit=' lines', total=sum(1 for _ in open(
                    '{}/{}'.format(preprocessing['train_folder'], file_name.replace('.bpe.', '.')), 'r',
                    encoding='utf-8', buffering=131072)))

                # Open input file
                with open('{}/{}'.format(preprocessing['train_folder'], file_name.replace('.bpe.', '.')), 'r',
                          encoding='utf-8', buffering=131072) as in_file:

                    # Iterate every 10k lines
                    for rows in read_lines(in_file, 10000, ''):

                        # Process using multiprocessing
                        rows = pool.map(apply_bpe, rows, 100)

                        # Join running threads from previous loop
                        if write_thread is not None:
                            write_thread.join()
                            # vocab_thread.join()
                            # print('+')
                            progress.update(written_lines)
                            # vocab_thread2.join()

                        # Thread for saving tokenized data to output BPE file
                        write_thread = Thread(target=write_lines, args=(out_file, rows, written_lines == 0))
                        write_thread.start()

                    # Join running threads and update progress bar
                    write_thread.join()
                    progress.update(written_lines)
                    progress.close()

            # Remove unnecessary train file (BPE one will be used by NMT)
            os.remove('{}/{}'.format(preprocessing['train_folder'], file_name.replace('.bpe.', '.')))

    print(colorama.Fore.GREEN + "\nPostprocessing and saving vocabs" + colorama.Fore.RESET)

    # Vocab files to be prepared
    # Joined vocab
    if preprocessing['joined_vocab']:
        vocab_files = [
            '{}.{}'.format(hparams['train_prefix'].replace('train', 'vocab'), hparams['src']).replace(
                preprocessing['train_folder'], '').lstrip('\\/'),
        ]

    # Separated vocabs
    else:
        vocab_files = [
            '{}.{}'.format(hparams['train_prefix'].replace('train', 'vocab'), hparams['src']).replace(
                preprocessing['train_folder'], '').lstrip('\\/'),
            '{}.{}'.format(hparams['train_prefix'].replace('train', 'vocab'), hparams['tgt']).replace(
                preprocessing['train_folder'], '').lstrip('\\/'),
        ]

    for vocab_file_name in vocab_files:
        print("File: {}{}{}".format(colorama.Fore.GREEN, vocab_file_name, colorama.Fore.RESET))

        # Get most common entities
        source = vocab_file_name.split('.')[-1]
        data_vocab[source] = [entity for entity, _ in data_vocab[source].most_common()]

        # Write entities to a file
        with open('{}/{}'.format(preprocessing['train_folder'], vocab_file_name), 'w', encoding='utf-8',
                  buffering=131072) as vocab_file:
            vocab_file.write("<unk>\n<s>\n</s>\n" + "\n".join(data_vocab[source][:preprocessing['vocab_size']]))
        with open('{}/{}'.format(preprocessing['train_folder'], vocab_file_name.replace('vocab', 'vocab_unused')), 'w',
                  encoding='utf-8', buffering=131072) as vocab_file:
            vocab_file.write("\n".join(data_vocab[source][preprocessing['vocab_size']:]))

    print(colorama.Fore.GREEN + "\nWriting pbtxt file" + colorama.Fore.RESET)

    # Write pbtxt file for metadata for embeddings
    with open('{}/{}'.format(os.path.join(train_log_dir), 'projector_config.pbtxt'), 'w', encoding='utf-8',
              buffering=131072) as pbtxt_file:
        pbtxt_file.write(('''embeddings {{\n    tensor_name: 'embeddings/decoder/embedding_decoder'\n    ''' +
                          '''metadata_path: '{}'\n}}\nembeddings {{\n    ''' +
                          '''tensor_name: 'embeddings/encoder/embedding_encoder'\n    metadata_path: '{}'\n}}''').format(
            '{}/{}'.format(preprocessing['train_folder'], vocab_files[0].replace('train', 'vocab')),
            '{}/{}'.format(preprocessing['train_folder'],
                           vocab_files[0 if preprocessing['joined_vocab'] else 1].replace('train', 'vocab'))
        ))

    print(colorama.Fore.GREEN + "\nAll done" + colorama.Fore.RESET)
コード例 #56
0
from collections import ChainMap
import os, argparse

# Build the default parameters:
defaults = {
    'color': 'red',
    'user': '******'
}

# Build and parse the command-line arguments:
parser = argparse.ArgumentParser()
parser.add_argument('-u', '--user')
parser.add_argument('-c', '--color')
namespace = parser.parse_args()
command_line_args = {k: v for k, v in vars(namespace).items() if v}

# Combine them into a ChainMap:
combined = ChainMap(command_line_args, os.environ, defaults)

# Print the parameters:
print('color=%s' % combined['color'])
print('user=%s' % combined['user'])

# Counter
from collections import Counter

c = Counter()
for ch in 'programming':
    c[ch] = c[ch] + 1
print(c)
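For reference, the loop at the end prints Counter({'r': 2, 'g': 2, 'm': 2, 'p': 1, 'o': 1, 'a': 1, 'i': 1, 'n': 1}); the same tally can be built in one step, without the explicit loop:

c = Counter('programming')
print(c)  # Counter({'r': 2, 'g': 2, 'm': 2, 'p': 1, 'o': 1, 'a': 1, 'i': 1, 'n': 1})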
コード例 #57
0
from collections import Counter
if __name__ == "__main__":
    s = input().strip()
    b = []
    a = Counter(s)
    for i in a.keys():
        b.append([i, a[i]])
    b = sorted(b, key=lambda x: (-int(x[1]), -ord(x[0])))[:3]
    for i in b:
        print(*i)
コード例 #58
0
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import time

# data = "Este es un ejemplo de remocion de palabras a ver que pasa con las palabras y la remocion"
articles = pd.read_csv("articles1.csv", usecols=[1, 2, 9])
ids = articles['id'].values
titles = articles['title'].values
content = articles['content'].values
stopWords = set(stopwords.words('english'))
index = {}
sub_index = {}
start = time.time()
for r in range(len(ids)):
    data = titles[r].lower() + content[r].lower()
    words = word_tokenize(data)
    wordsFiltered = []

    for w in words:
        if w not in stopWords and len(w) != 1:
            wordsFiltered.append(w)
    c = Counter(wordsFiltered)
    # Use distinct names so the row index r is not overwritten by the inner loop.
    for word, count in c.items():
        sub_index[word] = count
    index[ids[r]] = sub_index
    sub_index = {}
    data = ""
end = time.time()
print(index[17284])
print(end - start)
コード例 #59
0
ファイル: Clustering.py プロジェクト: shkr/tweet-event
	def entropy(self,word):

		if self.labels is None:
			raise ValueError('Please execute self.build_clusters() before calculating entropy(word)')

		word_labels = [ self.labels[i] for i,text in enumerate(self.Snap['TEXT']) if word in text]
		WordLabelDistribution = Counter(word_labels)
		print WordLabelDistribution
		NoOfText = float(sum(WordLabelDistribution.values()))
		WordEntropy = sum([ -(nlabelText/NoOfText)*log(nlabelText/NoOfText) for nlabelText in WordLabelDistribution.values() ])

		return WordEntropy
コード例 #60
0
def viterbi_segment(text):
    probs, lasts = [1.0], [0]
    for i in range(1, len(text) + 1):
        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)
                        for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    words = []
    i = len(text)
    while 0 < i:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words, probs[-1]

dictionary = Counter(words(open(r'C:\Users\Comete\big.txt').read()))
max_word_length = max(map(len, dictionary))
total = float(sum(dictionary.values()))
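viterbi_segment and the dictionary above rely on two helpers, words() and word_prob(), that are not shown in this snippet. A minimal sketch of what they conventionally look like in this Norvig-style segmentation recipe (an assumption, not recovered from the code above; they would need to be defined before the dictionary/total lines to actually run):

import re

def words(text):
    # Lowercased alphabetic tokens, as in the classic spell-corrector recipe.
    return re.findall(r'[a-z]+', text.lower())

def word_prob(word):
    # Relative corpus frequency; unseen words get probability 0.0.
    return dictionary[word] / total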


# 1. data processing
minutes.pop(minutes.columns[0])
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])

stop_words = stopwords.words('english')

import datetime
Month = [datetime.date(2008, i, 1).strftime('%B').lower() for i in range(1,13)]
stop_words.extend(['year','month','day','mr','meeting','committee','ms','federal','page']