Code example #1
from collections import Counter


def Sikora(file_list):

	for infile in file_list:
		output = infile.split('.')
		leader_output = output[0] + '_leader_tenmer.txt'
		background_output = output[0] + '_background_tenmer.txt'
		
		background_list = []
		flu_list = []


		with open(infile, 'r') as inF:
			for line in inF:
				if '>' in line:
					pass
				elif 'GCAAAAGCAGG' in line:
					flu_list.append(line[:10])
				else:					
					background_list.append(line[:10])

		print (background_list)

		o1 = open(leader_output, 'w')
		counted = Counter(flu_list)
		newv = counted.most_common()
		for x in newv:
			o1.write(x[0] + '\t' + str(x[1]) + '\n')

		o = open(background_output, 'w')
		counted = Counter(background_list)
		newv = counted.most_common()
		for x in newv:
			o.write(x[0] + '\t' + str(x[1]) + '\n')
Code example #2
import collections
from collections import Counter


def filter_alt_calls(alt_phased: collections.Counter, threshold: float):
    """
    Filter the alt_phased counter: keep only (chrom, pos, base) calls whose
    count is at least `threshold` of all observations at that position.
    """
    total_per_pos = Counter()
    for (phasedchrom, phased_pos, phased_base),obs in alt_phased.most_common():
        total_per_pos[(phasedchrom, phased_pos)] += obs

    return [(phasedchrom, phased_pos, phased_base)
            for (phasedchrom, phased_pos, phased_base), obs in alt_phased.most_common()
            if obs / total_per_pos[(phasedchrom, phased_pos)] >= threshold]
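A minimal usage sketch of filter_alt_calls; the chromosome/position/base keys below are made up for illustration and rely on the imports added above:

calls = Counter({('chr1', 100, 'A'): 9, ('chr1', 100, 'G'): 1, ('chr2', 50, 'T'): 4})
# Keep only calls that account for at least 80% of the observations at their position.
print(filter_alt_calls(calls, 0.8))  # -> [('chr1', 100, 'A'), ('chr2', 50, 'T')]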
Code example #3
def detect_flush(board):
    board_suits = [board[i].suit for i in range(len(board))]
    occurence_count = Counter(board_suits)
    suit, suit_occur = occurence_count.most_common(1)[0]
    if suit_occur < 5:
        return False
    else:
        suited_vals = sorted([board[i].value for i in range(len(board)) if board[i].suit == suit], reverse = True)
        # A straight flush needs five consecutive suited values; scan each window of five.
        unique_vals = sorted(set(suited_vals), reverse=True)
        for i in range(len(unique_vals) - 4):
            if unique_vals[i] - unique_vals[i + 4] == 4:
                # Return Straight Flush and the high end value
                return "Straight Flush", unique_vals[i]
        return "Flush", suited_vals[0], suited_vals[1], suited_vals[2], suited_vals[3], suited_vals[4]
Code example #4
def mostOccuringWords():
    global chat, bigString
    split_it = bigString.split()
    numWords = len(split_it)
    word_counts = Counter(split_it)  # avoid rebinding the Counter class
    most_occur = word_counts.most_common(20)
    print(most_occur)
def Locate_Leaders(filename):
    bamfiles_list = []
    bamfiles_list2 = []
    with open(filename, 'r') as inF:
        for line in inF:
            if 'VHE' in line:
                line = line.split('\t')
                bamfiles_list.append(line[9])
                bamfiles_list2.append([line[9], line[2], line[3]])
    #print("--- %s seconds ---" % (time.time() - start_time))
    #6.47017002106 seconds
    #namefile
    #in_file = os.path.abspath(filename)
    donor = (re.split('%20|%3a|%29', filename))[8]
    sample = (re.split('%20|%3a|%29', filename))[10]
    output = ('%s_%s_leader.tsv') % (sample, donor)
    #if os.path.isfile(output) == False:
    #	print ('%s_%s_leader.tsv has been created')%(sample,donor)
    o = open(output, 'ab+')
    for i in range(0, len(leaders[1000:5000])):
        if len(leaders[i]) > 8:
            value = [str(x).startswith(leaders[i]) for x in bamfiles_list]
            indexes = [y for y, j in enumerate(value) if j]
            location = [str(bamfiles_list2[x][1]) + ' ' + str(bamfiles_list2[x][2])
                        for x in indexes]
            total = str(len(location))
            location = Counter(location).most_common()
            o.write(str(leaders[i]) + '\t' + total)
            for l in location:
                o.write('\t' + str(l[0]) + '-' + str(l[1]))
            o.write('\n')
Code example #6
File: data.py Project: zidarsk8/dataMining
def addFakeData(oData,oLabels,count=100,low=10):
	data = oData[:]
	labels = oLabels[:]
	for iafsa in range(count):
		c = Counter(chain(*labels))
		lc = c.most_common()
	
		dlc = {}
		for l in lc: dlc[l[0]] = l[1]
	
		#teze = [sum([ dlc[y]**2 for y in x])  for x in labels]
		teze = [sum([ dlc[y] for y in x])  for x in labels]
		teze = sorted([(y,x) for x,y in enumerate(teze)])
		tt = teze[:max(low*10,200)]
		shuffle(tt)
		duplicate = [x[1] for x in tt[:low]]
		dLabels = [labels[i][:] for i in duplicate]
		dData = [data[i][:] for i in duplicate]
		for ii in range(1):
			for i in range(len(duplicate)):
				labels.append(dLabels[i])
				data.append(dData[i])
	# Shuffle the rows so they are no longer in order, while making sure
	# each label stays paired with its example.
	sd = []
	[sd.append((data[i],labels[i])) for i in xrange(len(data))]
	shuffle(sd)
	ll = []
	dd = []
	for x,y in sd:
		dd.append(x)
		ll.append(y)
	return (dd, ll)
 def topKFrequent(self, nums, k):
     lis = []
     nums_ctr = Counter(nums).most_common()
     for i in range(k):
         lis.append(nums_ctr[i][0])
     return lis
     """
Code example #8
    def calculate(cls,
                  model,
                  aspects: collections.Counter,
                  min_counts: int = -1):
        """Constructs word distance matrix from aspects counter."""
        words = []
        not_in_model = []
        n_smaller_count = 0
        for (word, counts) in aspects.most_common():
            if counts <= 0:
                raise ValueError("Word {} has {} <= 0 value in the counter. "
                                 "Distance matrix requires positive values "
                                 "only.".format(word, counts))
            if counts > min_counts:
                if word in model:
                    words.append(word)
                else:
                    not_in_model.append(word)
            else:
                n_smaller_count += 1

        print("{} valid words not in the Word2Vec model.".format(
            len(not_in_model)))
        print("{} words have count < {} and are ignored.".format(
            n_smaller_count, min_counts))
        print("Calculating matrix with {} words.".format(len(words)))
        matrix = np.eye(len(words))
        for i, word in enumerate(words):
            matrix[i, i:] = model.distances(word, words[i:])

        return cls(words, matrix)
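A rough usage sketch, assuming this classmethod lives on a class named DistanceMatrix (the class itself is not shown in the snippet) and that model is a gensim KeyedVectors instance, which supports the `word in model` checks and model.distances() calls used above:

import collections
from gensim.models import KeyedVectors

model = KeyedVectors.load('aspect_vectors.kv')  # path is hypothetical
aspects = collections.Counter({'battery': 40, 'screen': 25, 'rare_term': 1})
dist_matrix = DistanceMatrix.calculate(model, aspects, min_counts=5)  # DistanceMatrix is assumed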
Code example #9
def get_wordcloud_words():
    top_headlines = newsapi.get_top_headlines(language='en')
    query = ''
    articles = top_headlines['articles']
    for i in range(0, len(articles)):
        query += articles[i]['title'].lower() + " "

    #Removing stop words
    stopwordsfile = open('stopwords_en.txt', 'r')
    stopwords = [line.split('\n')[0] for line in stopwordsfile.readlines()]

    querywords = query.split()
    resultwords = [
        word for word in querywords if word.lower() not in stopwords
    ]
    result = ' '.join(resultwords)

    from collections import Counter
    split_it = result.split()
    word_counts = Counter(split_it)
    most_occur = word_counts.most_common(30)  # Get the 30 most frequent words

    most_frequent_results = []
    for i in range(0, len(most_occur)):
        mydict = {}
        mydict['size'] = str(most_occur[i][1])
        mydict['word'] = str(most_occur[i][0])
        most_frequent_results.append(mydict)

    return jsonify({'most_frequent_results': most_frequent_results})
Code example #10
    def _create_vocab(self, counter: collections.Counter, min_freq: int, max_size: Optional[int],
                      unk_token: Optional[str], pad_token: Optional[str], special_tokens: List) -> Tuple[Dict[str, int],
                                                                                                         List[str]]:
        """
        Handle the actual vocabulary creation.
        Tokens that appear less than min_freq times are ignored
        Once the vocabulary reaches max size, no more tokens are added
        `unk_token` is the token used to replace tokens not in the vocabulary
        `pad_token` is used to pad sequences
        `special_tokens` are other tokens we want appended to the start of our vocabulary, i.e. start of sequence tokens
        """
        stoi = dict()

        if unk_token is not None:
            stoi[unk_token] = len(stoi)
        if pad_token is not None:
            stoi[pad_token] = len(stoi)
        for special_token in special_tokens:
            stoi[special_token] = len(stoi)

        if max_size is not None:
            max_size = max_size - len(stoi)

        for token, count in counter.most_common(max_size):
            if count >= min_freq:
                if token not in stoi:
                    stoi[token] = len(stoi)
            else:
                break

        assert len(stoi) > 0, 'Created vocabulary is empty!'

        itos = list(stoi.keys())

        return stoi, itos
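A small sketch of the expected behaviour; the enclosing class is not shown in the snippet, so Vocab below is a hypothetical stand-in that only needs to provide this method:

import collections

counter = collections.Counter({'the': 10, 'cat': 3, 'sat': 1})
stoi, itos = Vocab()._create_vocab(counter, min_freq=2, max_size=None,
                                   unk_token='<unk>', pad_token='<pad>', special_tokens=[])
# stoi == {'<unk>': 0, '<pad>': 1, 'the': 2, 'cat': 3}; 'sat' is dropped (count < min_freq)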
Code example #11
def count_10_word_in_json():
    from collections import Counter
    with open("newsafr.json", encoding="utf-8") as datafile:
        json_data = json.load(datafile)
        json_items = json_data["rss"]["channel"]["items"]

        descriptions = []

        for i in json_items:
            descriptions.append(i["description"].split())

        format_description = []

        for elem in sum(descriptions, []):
            if len(elem) > 6:
                format_description.append(elem.lower())

        def sortByLength(inputStr):
            return len(inputStr)

        format_description.sort(key=sortByLength, reverse=True)

        word_counts = Counter(format_description)
        words = word_counts.most_common(10)
        for word in words:
            pprint(f"The word '{word[0]}' occurs {word[1]} times")
Code example #12
def count_10_word_in_xml():
    from collections import Counter
    tree = ET.parse("newsafr.xml")
    descriptions = []
    root = tree.getroot()
    xml_items = root.findall("channel/item")

    for item in xml_items:
        description = item.find("description")
        descriptions += description.text.split()

    format_description = []
    for word in descriptions:
        if len(word) > 6:
            format_description.append(word.lower())

    def sortByLength(inputStr):
        return len(inputStr)

    format_description.sort(key=sortByLength, reverse=True)

    word_counts = Counter(format_description)
    words = word_counts.most_common(10)
    for word in words:
        print(f"The word '{word[0]}' occurs {word[1]} times")
Code example #13
    def _initialize_data(self) -> None:
        """
        Loads the dataset and prepares for it to be generated
        """
        data = load_from_csv(self._filepath, self.name_col, self.lat_col,
                             self.lon_col, self.value_col)
        value_label = data.pop()
        self._names, self._lats, self._lons, self._values = data

        self._lat_max = max(
            self._lats) + self.border_offset + self.north_offset
        self._lat_min = min(
            self._lats) - self.border_offset - self.south_offset
        self._lon_max = max(self._lons) + self.border_offset + self.east_offset
        self._lon_min = min(self._lons) - self.border_offset - self.west_offset

        # assigning a number to each unique value provided, and map it to the points
        count = IterCounter(self._values)
        # value[0] because each element of count.most_common() is an
        # (item, count) tuple: the item name and how many times that item
        # appears in the data.
        if self._mode == MODES[0]:
            self._legend = {
                value[0]: i + 1 if i + 1 <= len(COLOURS) else len(COLOURS)
                for i, value in enumerate(count.most_common())
            }
            self._verboseprint(self._legend)

        elif self._mode == MODES[1]:
            self._legend = {value_label: 1}
            self._values = [float(v) for v in self._values]
Code example #14
def Bartel(file_list):

	for infile in file_list:
		output = infile.split('.')
		leader_output = output[0] + 'leader.txt'
		tenmer_output = output[0] + 'tenmer.txt'
		leaders_list = []
		tenmers_list = []
		with open(infile, 'r') as inF:
			for line in inF:
				if 'ORIGINAL_SEQUENCE' in line:
					line_a = line.split('\t')
					place = line_a.index('ORIGINAL_SEQUENCE')
		with open(infile, 'r') as inF:
			for line in inF:
				leader = (line.split('\t'))[place]
				leaders_list.append(leader)

				if len(leader) >= 10:
					tenmers_list.append(leader[:10])

		o = open(leader_output, 'w+')
		o.write(str(len(leaders_list)) + '\n')
		for leader_seq in leaders_list:
			o.write(leader_seq + '\n')

		o1 = open(tenmer_output, 'w+')
		o1.write(str(len(tenmers_list)) + '\n')

		counted = Counter(tenmers_list)
		newv = counted.most_common()
		for x in newv:
			o1.write(x[0] + '\t' + str(x[1]) + '\n')
Code example #15
File: main.py Project: eagle12td/dataMining
def wanabeknn(k=15):
	from collections import Counter
	ftrd = open("minidata/trainingData.csv")
	fted = open("minidata/testData.csv")
	flab = open("minidata/trainingLabels.csv")

	lab = [[int(j) for j in i.strip().split(",")]  for i in flab.readlines()]
	trd = [[int(j) for j in i.strip().split("\t")] for i in ftrd.readlines()]
	ted = [[int(j) for j in i.strip().split("\t")] for i in fted.readlines()]

	def dist(a,b): return sum([min(a[i], b[i]) for i in xrange(len(a))])

	rez = []
	for v in ted:
		print "hurej  %4d   %3d" % ( len(rez),len(rez[-1:]))
		t = []
		for trindex, train in enumerate(trd):
			t.append((dist(train, v), trindex))
		tt = sorted(t, reverse=True)
		ll = []
		for i in range(k): ll += lab[tt[i][1]]
		n = len(ll)
		for i in range(k/3): ll += lab[tt[i][1]]
		rez.append([x[0] for x in Counter(ll).most_common(n/k)])
		print rez
	cPickle.dump(rez, file("rezPickled/wnbknn%d.pickled" % k, "w"), -1)
Code example #16
 def _most_preferred(self, alternatives):
     """Applies funcnamei from each trait to the alternatives and return the most preferred."""
     prefs = [y for y in [getattr(x, funcnamei)(alternatives) for x in self.traits] if y is not None]
     if not prefs:
         return None
     if len(prefs) == 1:
         return prefs[0]
     return Counter(prefs).most_common(1)[0][0]
Code example #18
File: metrics.py Project: markus-beuckelmann/pattern
 def items(self, relative=False):
     """ Returns a list of (key, value)-tuples sorted by value, highest-first.
         With relative=True, the sum of values is 1.0.
     """
     a = Counter.most_common(self)
     if relative:
         n = sum(v for k, v in a) or 1.
         a = [(k, v / n) for k, v in a]
     return a
Code example #20
    def reorganizeString(self, S):
        """
        :type S: str
        :rtype: str
        AAAB
        (3 - 1) * 2 + 1 <= 4
        ((most count - 1) * (N + 1)) + Counted value <= len(S), OK.
        ((most count - 1) * (N + 1)) + Counted value > len(S), Doomed.
        """
        from collections import Counter as CC
        c = CC(S)
        most_counted_char = max(c.values())

        total_cnt = (most_counted_char - 1) * 2 + \
            list(c.values()).count(most_counted_char)

        if total_cnt > len(S):
            return ""
        """
        n list of chars.
        pop 1 by 1 from most to least.
        """
        # create list
        ll = []

        for i in c.most_common():
            ll.append([i[0]] * i[1])

        # [[2,2,2],[3,3],[1]]
        # [2,2] [3] [2] [3]

        # result:
        # 2 3 2 3 2 1

        result = ""
        # v_v_v_
        # vlv_v_
        # vlvov
        while True:
            flip = 0

            for l in ll:
                if l:
                    if flip < 2:
                        if result and result[-1] == l[-1]:
                            continue
                        result += l.pop()
                        flip += 1
                    else:
                        break

            if flip == 0:
                break

        return result
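A quick check of the feasibility bound from the docstring; Solution is the usual LeetCode wrapper class and is assumed here, not shown in the snippet:

s = Solution()
print(s.reorganizeString("aab"))   # a valid interleaving such as "aba"
print(s.reorganizeString("aaab"))  # "" because (3 - 1) * 2 + 1 = 5 > len("aaab") = 4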
Code example #21
    def mostCommonWord(self, paragraph: str, banned: List[str]) -> str:
        count = Counter(re.sub(r'[^\w]', ' ', paragraph.lower()).split())
        ban_set = set(banned)

        # return next(k for k in Counter.most_common(count) if not k[0][0] in ban_set)
        while True:
            most = count.most_common(1)[0][0]
            if most in ban_set:
                count.pop(most)
            else:
                return most
Code example #22
File: data.py Project: zidarsk8/dataMining
def removeLeastCommonData(oData, oLabels, least=5):
	data = oData[:]
	labels = oLabels[:]
	c = Counter(chain(*labels))
	lc = c.most_common()
	bb = sorted(list(Set([j for i,j in lc])))
	a = [x[0] for x in lc if x[1] < bb[least]]
	rem = [i for i,j in enumerate(labels) if len(Set(j).intersection(Set(a))) > 0 ]
	[labels.pop(x) for x in sorted(rem, reverse=True)]
	[data.pop(x) for x in sorted(rem, reverse=True)]
	return (data, labels)
Code example #23
def Repeat2(input):
    words = input.lower().replace(',', ' ').replace('.',
                                                    ' ').replace('?',
                                                                 ' ').split()
    counts = Counter(words)
    print(counts)
    print('most common', counts.most_common(1)[0])
    for key in words:
        if counts[key] > 1:
            print(key, counts[key])
            return
Code example #24
def main():
    global string_array
    global char_array
    global words
    global final_list
    global most_occur

    with open(args.filepath) as f:
        d = json.load(f)

    #separate the json file info into smaller, more usable parts
    profile_pics = d["profile_pictures"]
    contacts = d["contacts"]
    c_list = contacts["list"]
    chats = d["chats"]
    chat_list = chats["list"]
    

    #get all messages from name
    for item in chat_list:
    
        if len(item) == 4:
            if item["name"] == args.name:
            
                for i in range(len(item["messages"])):

                    string_array = np.append(string_array , item["messages"][i]["text"])
        

    
    #split and append to word list
    for item in string_array:
        if isinstance(item, str):

            split_it = item.split()
            words.append(split_it)



    #add every word to list
    for item in words:
    
        final_list = final_list + item
      
    #calculate most common word
    go = [x.upper() for x in final_list]
    word_counts = Counter(go)
    most_occur = word_counts.most_common(10) #this number can be modified. Right now only the 10 most common words are displayed
    print(most_occur)
Code example #25
 def getList(self, data): 
     from collections import Counter
     import pandas as pd
     n=20000
     split_it = data.split()
     word_counts = Counter(split_it)
     most_occur = word_counts.most_common(n)
     vocalList=[]
     for i in range(n):
         vocalList.append(most_occur[i][0])
     return vocalList
Code example #26
File: visualizer.py Project: xyz1396/qiime2
def most_common_viz(output_dir: str, ints: collections.Counter) -> None:
    df = pd.DataFrame(ints.most_common(), columns=["Integer", "Frequency"])

    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        fh.write('<html><body>\n')
        fh.write('<h3>Most common integers:</h3>\n')
        fh.write(df.to_html(index=False))
        fh.write('</body></html>')

    with open(os.path.join(output_dir, 'index.tsv'), 'w') as fh:
        fh.write(df.to_csv(sep='\t', index=False))
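The visualizer can also be called directly, outside of QIIME 2. A minimal sketch, assuming the snippet's own os/pandas imports are in place (the output directory below is made up):

import collections
import tempfile

outdir = tempfile.mkdtemp()
most_common_viz(outdir, collections.Counter([1, 1, 2, 3, 3, 3]))
# outdir/index.html and outdir/index.tsv now list each integer with its frequency.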
Code example #27
def count_code_samples(url):
    d = {}
    req = rq.get(url)
    source_code = req.text
    pattern = r'<code>(.*?)</code>'
    results = re.findall(pattern, source_code)
    for item in results:
        if item not in d:
            d[item] = 0
        d[item] += 1

    return Counter(d).most_common()
Code example #28
def locate_leaders(filename):
    donor = (re.split('%20|%3a|%29', filename))[8]
    sample = (re.split('%20|%3a|%29', filename))[10]
    output = (output_path + '%s_%s_locations.tsv') % (sample, donor)
    #query_list
    query_file = ('fishers_output_%s_%s_filteredfdr.txt') % (sample, donor)
    query_file = query_file.replace('onor', '')
    print query_file
    print output
    if query_file in unfinished:
        query_file = output_path + query_file
        query_list = []
        with open(query_file, 'r') as inF:
            next(inF)
            for line in inF:
                query = (line.split('\t'))[0]
                query_list.append(query)
        filename = path + filename
        bamfiles_list = []
        bamfiles_list2 = []
        with open(filename, 'r') as inF:
            for line in inF:
                if 'VHE' in line:
                    line = line.split('\t')
                    if line[1] == '0':
                        bamfiles_list.append(line[9])
                        bamfiles_list2.append(
                            [line[9], line[2], (str(line[3]) + ',+')])
                    elif line[1] == '16':
                        bamfiles_list.append(line[9])
                        bamfiles_list2.append(
                            [line[9], line[2], (str(line[3]) + ',-')])
                    else:
                        pass
        o = open(output, 'ab+')
        for i in range(0, len(query_list)):
            value = [str(x).startswith(query_list[i]) for x in bamfiles_list]
            indexes = [y for y, j in enumerate(value) if j]
            location = [str(bamfiles_list2[x][1]) + ' ' + str(bamfiles_list2[x][2])
                        for x in indexes]
            total = str(len(location))
            o.write(str(query_list[i]) + '\t' + total)
            location = Counter(location).most_common()
            for l in location:
                o.write('\t' + str(l[0]) + ':' + str(l[1]))
            o.write('\n')
        return output
Code example #29
def getKeyWords(Counter, num, country):
    
    # Gets the news data from the news api website
    news_url = ('https://newsapi.org/v2/top-headlines?'
           'country={}&'
           'apiKey=a8a1a5ea66c04f1488210e7b0016b948'.format(country.lower()))
    response = requests.get(news_url)

    # List of stop words that are to be removed from our list of words
    stopwords = ["I", "Me", "My", "Myself", "We", "Our", "Ours", "Ourselves", "You",
                 "Your", "Yours", "Yourself", "Yourselves", "He", "Him", "His",
                 "Himself", "She", "Her", "hers", "herself", "it", "its", "itself",
                 "they", "them", "their", "theirs", "themselves", "what", "which",
                 "who", "whom", "this", "that", "these", "those", "am", "is", "are",
                 "was", "were", "be", "been", "being", "have", "has", "had",
                 "having", "do", "does", "did", "doing", "a", "an", "the", "and",
                 "but", "if", "or", "because", "as", "until", "while", "of", "at",
                 "by", "for", "with", "about", "against", "between", "into",
                 "through", "during", "before", "after", "above", "below", "to",
                 "from", "up", "down", "in", "out", "on", "off", "over", "under",
                 "again", "further", "then", "once", "here", "there", "when",
                 "where", "why", "how", "all", "any", "both", "each", "few",
                 "more", "most", "other", "some", "such", "no", "nor", "not",
                 "only", "own", "same", "so", "than", "too", "very", "s", "t",
                 "can", "will", "just", "don", "should", "now", "-", "News", "The",
                 "|", "CNN", "CBS", "BBC", "Guardian", "says", "news", "Daily",
                 "Mail", "Online", "don't", "Mirror", "After", "NPR", "Washington",
                 "Post", "", "new", "Sky", "ITV", "could" , "suggests", "fears",
                 "live", "say"]

    # Variable to store the list of headlines
    words = ''
    # Store the headlines in the declared variable
    for i in response.json()['articles']:
        words +=  i['title'] + ' '

    # Splits the headlines into an array where each word is its own value
    words2 = words.split()
    # Removes special characters
    words2 = [re.sub('[^a-zA-Z]+', '', x) for x in words2]
    # Counts the occurrence of each word
    Counter = Counter(words2)

    # Removes the words in the stop word array from the list
    for i in stopwords:
        del Counter[i[:1].upper() + i[1:]]
        del Counter[i[:1].lower() + i[1:]]
        del Counter[i.upper()]
        del Counter[i.lower()]
    # Returns list of most common words
    return Counter.most_common(num)
Code example #30
def count_frequent_words():
    df = pd.read_csv('../../csv_files/scmp_article_content_all.csv')
    text_dataset = df['paragraphs']
    #print(type(text_dataset.head()))

    split_it = text_dataset.str.split()
    #print(split_it)

    #counter = Counter(split_it)
    word_counts = Counter(x for xs in split_it for x in xs)
    most_occur = word_counts.most_common(10)
    print(most_occur)
    return most_occur
Code example #31
def output_most_common(directory):
    files = glob.glob(os.path.join(directory, "*.txt"))
    for file in files:
        file = strip_stop_words(file).split()

        # with open(file, 'w') as csv_file:
        #     writer = csv.writer(csv_file)
        #     writer.writerow(['Word', 'Frequency'])
        #     for key, value in hold_diffs.items():
        #         writer.writerow([key, value])

        # return file

        print(Counter(file).most_common(50))
Code example #32
File: data.py Project: zidarsk8/dataMining
def removeMostCommonData(oData, oLabels, count=20):
	data = oData[:]
	labels = oLabels[:]
	for iafsa in range(count):
		c = Counter(chain(*labels))
		lc = c.most_common()
		dlc = {}
		for l in lc: dlc[l[0]] = l[1]
		teze = [max([ dlc[y] for y in x])  for x in labels]
		teze = sorted([(y,x) for x,y in enumerate(teze)])
		rem = [x[1] for x in teze[-10:]]
		[labels.pop(x) for x in sorted(rem, reverse=True)]
		[data.pop(x) for x in sorted(rem, reverse=True)]
	return (data, labels)
Code example #33
def write_token_vocab(words: collections.Counter, save_path: Text) -> None:
    """"Writes token vocabulary from @words to @save_path."""
    # Sort tokens by frequency and then lexically to break ties.
    words_with_counts = words.most_common()
    words_with_counts.sort(key=lambda x: (x[1], x[0]), reverse=True)
    vocab_path = os.path.join(save_path, 'vocab.cfq.tokens')

    with open(vocab_path, 'w') as f:
        # Tensor2tensor needs these additional tokens.
        f.write('<pad>\n<EOS>\n<OOV>\n')
        for word, _ in words_with_counts:
            f.write(f'{word}\n')
    logging.info(f'Token vocabulary written to {vocab_path} ({len(words)} '
                 'distinct tokens).')
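A minimal usage sketch (the save path is made up; the function's own os/logging imports are assumed to be in place):

import collections

words = collections.Counter({'walk': 5, 'run': 5, 'jump': 2})
write_token_vocab(words, '/tmp')
# /tmp/vocab.cfq.tokens starts with <pad>, <EOS>, <OOV>, then walk, run, jump
# ('walk' before 'run': ties on count are broken by reverse lexical order).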
Code example #34
File: FE595TextAnalysis.py Project: sbhakuni/FE-595
def Counter1(filename):
    with open(filename, 'r') as file:
        datastring = file.read().replace('\n', '')
    # TextBlob extracts the noun phrases from the text
        
    blob=TextBlob(datastring)
    qaz=blob.noun_phrases
    
    decriptors_series=[]
    for i in range(0,len(qaz)):    
        decriptors_series.append(qaz[i])         
    word_counts = Counter(decriptors_series)
    most_occur = word_counts.most_common(10)
    return most_occur
Code example #35
def comcap(instaid):
    idlink = 'https://www.instagram.com/{}/?__a=1'
    url = idlink.format(instaid)

    data = requests.get(url).json()
    if (data['graphql']['user']['is_private']):

        return 'oops its a private account'
    user = data['graphql']['user']['edge_owner_to_timeline_media']['edges']

    num = len(user)
    #print(num)

    allcap = ''

    for i in range(0, num):
        try:
            x = user[i]['node']['edge_media_to_caption']['edges'][0]['node'][
                'text']
            m = re.findall(r'[#]\w+', x)
            t = ''

            for j in m:
                t = t + j + ' '
            t = re.sub(r'\#', ' ', t)
            t = t.lower()
            allcap = allcap + t + ' '
        except:
            continue

    from collections import Counter
    data_set = allcap
    # split() returns list of all the words in the string
    split_it = data_set.split()

    # Pass the split_it list to an instance of the Counter class.
    word_counts = Counter(split_it)
    #print(word_counts)

    # most_common() produces the k most frequently encountered
    # input values and their respective counts.
    most_occur = word_counts.most_common(8)
    txt = ''
    for i in most_occur:
        txt = txt + '#' + i[0] + ' '
    #print(txt)

    return txt
Code example #36
def MostFrequent():
    from collections import Counter

    data_set = "Welcome to the world of Geeks " \
               "This portal has been created to provide well written well" \
               "thought and well explained solutions for selected questions " \
               "If you like Geeks for Geeks and would like to contribute " \
               "here is your chance You can write article and mail your article " \
               " to contribute at geeksforgeeks org See your article appearing on " \
               "the Geeks for Geeks main page and help thousands of other Geeks. " \

    split_it = data_set.split()

    word_counts = Counter(split_it)

    most_occur = word_counts.most_common(9)

    print(most_occur)
Code example #37
 def most_common(self, n, conts):
     """Returns the n most frequent words in conts."""
     return Counter(conts).most_common(n)
Code example #38
File: counter.py Project: aruntakkar/PyCode
from collections import Counter

text = "In February 2014, I made a recommendation to my co - founders at" \
    "Ballistiq that I wanted to cancel development of ArtStation." \
    "The project was in development hell. It wasn’t going anywhere." \
    "I was unhappy with it and just couldn’t see a path for it to be a"\
    "successful product. Two months later we managed to launch it," \
    "and two years later it is the leading network for professional games."

words = text.split()

word_counts = Counter(words)

top_three = word_counts.most_common(3)

print(top_three)
Code example #39
File: skp.py Project: Smotko/data-mining
from collections import Counter

hairy = open("RFand1R-2-2")
majcn = open("resultRF-0.72388.csv")
smotko = open("smotkoTopScore.csv")
zidar = open("result-0.36027.csv")
aaa = open("resultRF-0.85714.csv")

a = []
a.append([[int(j) for j in i.strip().split(" ")] for i in majcn.readlines()])
a.append([[int(j) for j in i.strip().split(" ")] for i in zidar.readlines()])
a.append([[int(j) for j in i.strip().split(" ")] for i in smotko.readlines()])
a.append([[int(j) for j in i.strip().split(" ")] for i in hairy.readlines()])
a.append([[int(j) for j in i.strip().split(" ")] for i in aaa.readlines()])
b = []
for i in xrange(len(a[0])):
	c = Counter(a[0][i]+a[1][i]+a[2][i]+a[3][i]+a[4][i])
	x = [x[0] for x in c.most_common() if x[1]>1]
	if len(x)==0 :
		x = [40, 44, 18, 62, 41]
		print "aa"
	b.append(x)
for i in b:
	if len(i) == 1:
		print "aaa"
print "bb"
f = file("RFand1R-2-2","w")
f.write("\n".join(   [" ".join([str(x) for x in i]) for i in b ]    ))
f.flush()
f.close()
Code example #40
File: skp.py Project: eagle12td/dataMining
from collections import Counter

files = [open("knn.txt"), open("knnm.txt"), open("RF200por.txt"), open("test.txt")]

a = []
for f in files:
	a.append([[int(j) for j in i.strip().split(" ")] for i in f.readlines()])

b = []
for i in xrange(len(a[0])):
	c = Counter(a[0][i]+a[1][i]+a[2][i])
	x = [x[0] for x in c.most_common() if x[1]>2]
	if len(x)==0 :
		x = [40]
	b.append(x)

print "done"
f = file("skupniNoTest.csv","w")
f.write("\n".join(   [" ".join([str(x) for x in i]) for i in b ]    ))
f.close()
Code example #41
File: EmoCount.py Project: lolgans/twitchSentiment
# print Counter(foundEmosAndSmilies)
# print("found smilies top50")
# print Counter.most_common(Counter(foundEmosAndSmilies), 50)
# print("total number of sentences with smilies:")
# print numOfTotalSentences

"""
Classify
"""
for message in messages:
    messagecounter += 1
    # for every sentence
    score, words = emoCount.score(message)

    foundEmosAndSmilies = foundEmosAndSmilies + words
    if len(words) != 0:
        numOfTotalSentences += 1
        print(words)


# in the end
print("Messages total:")
print messagecounter
print("found smilies total:")
print sum(Counter(foundEmosAndSmilies).values())
print("found smilies examples ordered:")
print Counter(foundEmosAndSmilies)
print("found smilies top50")
print Counter(foundEmosAndSmilies).most_common(50)
print("total number of sentences with smilies:")
print numOfTotalSentences