def Sikora(file_list):
    for infile in file_list:
        output = infile.split('.')
        leader_output = output[0] + '_leader_tenmer.txt'
        background_output = output[0] + '_background_tenmer.txt'
        background_list = []
        flu_list = []
        with open(infile, 'r') as inF:
            for line in inF:
                if '>' in line:
                    pass  # skip header lines
                elif 'GCAAAAGCAGG' in line:
                    flu_list.append(line[:10])
                else:
                    background_list.append(line[:10])
        print(background_list)
        # Write ranked 10-mer counts for reads carrying the GCAAAAGCAGG motif.
        with open(leader_output, 'w') as o1:
            counted = Counter(flu_list)
            for tenmer, n in counted.most_common():
                o1.write(tenmer + '\t' + str(n) + '\n')
        # And for the remaining (background) reads.
        with open(background_output, 'w') as o:
            counted = Counter(background_list)
            for tenmer, n in counted.most_common():
                o.write(tenmer + '\t' + str(n) + '\n')
def filter_alt_calls(alt_phased: collections.Counter, threshold: float):
    """Filter the alt_phased counter, keeping only calls whose observation
    count is at least `threshold` as a fraction of all observations at the
    same (chrom, pos)."""
    total_per_pos = Counter()
    for (phasedchrom, phased_pos, phased_base), obs in alt_phased.most_common():
        total_per_pos[(phasedchrom, phased_pos)] += obs
    return [(phasedchrom, phased_pos, phased_base)
            for (phasedchrom, phased_pos, phased_base), obs in alt_phased.most_common()
            if obs / total_per_pos[(phasedchrom, phased_pos)] >= threshold]
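# A minimal usage sketch for filter_alt_calls above; the (chrom, pos, base)
# keys, the counts, and the threshold are made up for illustration:
from collections import Counter

alt_phased = Counter({
    ('chr1', 100, 'A'): 8,  # 8/10 of observations at ('chr1', 100)
    ('chr1', 100, 'G'): 2,  # 2/10 of observations at ('chr1', 100)
    ('chr2', 205, 'T'): 1,  # the only observation at ('chr2', 205)
})
print(filter_alt_calls(alt_phased, threshold=0.25))
# [('chr1', 100, 'A'), ('chr2', 205, 'T')]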
def detect_flush(board):
    board_suits = [card.suit for card in board]
    occurence_count = Counter(board_suits)
    suit, suit_occur = occurence_count.most_common(1)[0]
    if suit_occur < 5:
        return False
    suited_vals = sorted([card.value for card in board if card.suit == suit],
                         reverse=True)
    # Checks that every rank from min(suited_vals) up to max(suited_vals) - 2
    # is present among the suited cards.
    if all(x in suited_vals for x in range(min(suited_vals), max(suited_vals) - 1)):
        return "Straight Flush", max(suited_vals)  # straight flush and its high-end value
    return ("Flush", suited_vals[0], suited_vals[1], suited_vals[2],
            suited_vals[3], suited_vals[4])
def mostOccuringWords():
    global chat, bigString
    split_it = bigString.split()
    numWords = len(split_it)
    # Keep the Counter class and the instance under distinct names.
    word_counts = Counter(split_it)
    most_occur = word_counts.most_common(20)
    print(most_occur)
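# Several snippets in this collection originally rebound the name `Counter`
# to a Counter instance (`Counter = Counter(words)`). That works exactly
# once: afterwards the class is unreachable under that name and any later
# call fails. A minimal sketch of the pitfall, with toy data:
from collections import Counter

words = "a b a c a b".split()
counts = Counter(words)        # keep class and instance under distinct names
print(counts.most_common(2))   # [('a', 3), ('b', 2)]
# Counter = Counter(words)    # after this rebinding, a second Counter(...)
#                             # call raises TypeError: 'Counter' object is not callable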
def Locate_Leaders(filename):
    bamfiles_list = []
    bamfiles_list2 = []
    with open(filename, 'r') as inF:
        for line in inF:
            if 'VHE' in line:
                line = line.split('\t')
                bamfiles_list.append(line[9])
                bamfiles_list2.append([line[9], line[2], line[3]])
    # print("--- %s seconds ---" % (time.time() - start_time))  # 6.47017002106 seconds
    # namefile
    # in_file = os.path.abspath(filename)
    donor = re.split('%20|%3a|%29', filename)[8]
    sample = re.split('%20|%3a|%29', filename)[10]
    output = '%s_%s_leader.tsv' % (sample, donor)
    # if os.path.isfile(output) == False:
    #     print('%s_%s_leader.tsv has been created' % (sample, donor))
    o = open(output, 'ab+')
    # `leaders` is assumed to be a module-level list; note that this loops
    # over the first len(leaders[1000:5000]) indices of the full list.
    for i in range(0, len(leaders[1000:5000])):
        if len(leaders[i]) > 8:
            value = list(map(lambda x: str(x).startswith(leaders[i]), bamfiles_list))
            indexes = [y for y, j in zip(count(), value) if j]
            location = list(map(lambda x: str(bamfiles_list2[x][1]) + ' ' +
                                str(bamfiles_list2[x][2]), indexes))
            total = str(len(location))
            location = Counter(location).most_common()
            o.write(str(leaders[i]) + '\t' + total)
            for l in location:
                o.write('\t' + str(l[0]) + '-' + str(l[1]))
            o.write('\n')
    o.close()
def addFakeData(oData, oLabels, count=100, low=10):
    data = oData[:]
    labels = oLabels[:]
    for iafsa in range(count):
        c = Counter(chain(*labels))
        lc = c.most_common()
        dlc = {}
        for l in lc:
            dlc[l[0]] = l[1]
        # teze = [sum([dlc[y] ** 2 for y in x]) for x in labels]
        # teze ("weights"): score each example by the summed global counts of
        # its labels, then duplicate a few of the rarest-labelled examples.
        teze = [sum([dlc[y] for y in x]) for x in labels]
        teze = sorted([(y, x) for x, y in enumerate(teze)])
        tt = teze[:max(low * 10, 200)]
        shuffle(tt)
        duplicate = [x[1] for x in tt[:low]]
        dLabels = [labels[i][:] for i in duplicate]
        dData = [data[i][:] for i in duplicate]
        for ii in range(1):
            for i in range(len(duplicate)):
                labels.append(dLabels[i])
                data.append(dData[i])
    # Shuffle the rows so they are no longer neatly ordered, making sure each
    # label stays with its own example.
    sd = []
    [sd.append((data[i], labels[i])) for i in xrange(len(data))]
    shuffle(sd)
    ll = []
    dd = []
    for x, y in sd:
        dd.append(x)
        ll.append(y)
    return (dd, ll)
def topKFrequent(self, nums, k):
    lis = []
    nums_ctr = Counter(nums).most_common()
    for i in range(k):
        lis.append(nums_ctr[i][0])
    return lis
def calculate(cls, model, aspects: collections.Counter, min_counts: int = -1):
    """Constructs word distance matrix from aspects counter."""
    words = []
    not_in_model = []
    n_smaller_count = 0
    for (word, counts) in aspects.most_common():
        if counts <= 0:
            raise ValueError("Word {} has {} <= 0 value in the counter. "
                             "Distance matrix requires positive values "
                             "only.".format(word, counts))
        if counts > min_counts:
            if word in model:
                words.append(word)
            else:
                not_in_model.append(word)
        else:
            n_smaller_count += 1
    print("{} valid words not in the Word2Vec model.".format(len(not_in_model)))
    print("{} words have count <= {} and are ignored.".format(
        n_smaller_count, min_counts))
    print("Calculating matrix with {} words.".format(len(words)))
    matrix = np.eye(len(words))
    for i, word in enumerate(words):
        matrix[i, i:] = model.distances(word, words[i:])
    return cls(words, matrix)
def get_wordcloud_words():
    top_headlines = newsapi.get_top_headlines(language='en')
    query = ''
    articles = top_headlines['articles']
    for i in range(0, len(articles)):
        query += articles[i]['title'].lower() + " "
    # Remove stop words.
    with open('stopwords_en.txt', 'r') as stopwordsfile:
        stopwords = [line.split('\n')[0] for line in stopwordsfile.readlines()]
    querywords = query.split()
    resultwords = [word for word in querywords if word.lower() not in stopwords]
    result = ' '.join(resultwords)

    from collections import Counter
    split_it = result.split()
    word_counts = Counter(split_it)
    most_occur = word_counts.most_common(30)  # get the 30 most frequent words
    most_frequent_results = []
    for i in range(0, len(most_occur)):
        mydict = {}
        mydict['size'] = str(most_occur[i][1])
        mydict['word'] = str(most_occur[i][0])
        most_frequent_results.append(mydict)
    return jsonify({'most_frequent_results': most_frequent_results})
def _create_vocab(self, counter: collections.Counter, min_freq: int,
                  max_size: Optional[int], unk_token: Optional[str],
                  pad_token: Optional[str],
                  special_tokens: List) -> Tuple[Dict[str, int], List[str]]:
    """
    Handle the actual vocabulary creation.
    Tokens that appear less than min_freq times are ignored.
    Once the vocabulary reaches max_size, no more tokens are added.
    `unk_token` is the token used to replace tokens not in the vocabulary.
    `pad_token` is used to pad sequences.
    `special_tokens` are other tokens we want appended to the start of our
    vocabulary, e.g. start-of-sequence tokens.
    """
    stoi = dict()
    if unk_token is not None:
        stoi[unk_token] = len(stoi)
    if pad_token is not None:
        stoi[pad_token] = len(stoi)
    for special_token in special_tokens:
        stoi[special_token] = len(stoi)
    if max_size is not None:  # max_size is Optional; None means unbounded
        max_size = max_size - len(stoi)
    # most_common(None) yields all tokens in descending frequency order, so
    # the loop can stop at the first token below min_freq.
    for token, count in counter.most_common(max_size):
        if count >= min_freq:
            if token not in stoi:
                stoi[token] = len(stoi)
        else:
            break
    assert len(stoi) > 0, 'Created vocabulary is empty!'
    itos = list(stoi.keys())
    return stoi, itos
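# A minimal usage sketch for _create_vocab above. The method never touches
# `self`, so None is passed in its place here; the token counts are made up:
import collections

counter = collections.Counter({'the': 5, 'cat': 3, 'sat': 2, 'mat': 1})
stoi, itos = _create_vocab(None, counter, min_freq=2, max_size=10,
                           unk_token='<unk>', pad_token='<pad>',
                           special_tokens=['<sos>'])
print(stoi)
# {'<unk>': 0, '<pad>': 1, '<sos>': 2, 'the': 3, 'cat': 4, 'sat': 5}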
def count_10_word_in_json():
    from collections import Counter
    with open("newsafr.json", encoding="utf-8") as datafile:
        json_data = json.load(datafile)
    json_items = json_data["rss"]["channel"]["items"]
    descriptions = []
    for i in json_items:
        descriptions.append(i["description"].split())
    format_description = []
    for elem in sum(descriptions, []):
        if len(elem) > 6:
            format_description.append(elem.lower())

    def sortByLength(inputStr):
        return len(inputStr)

    format_description.sort(key=sortByLength, reverse=True)
    word_counts = Counter(format_description)
    words = word_counts.most_common(10)
    for word in words:
        pprint(f"Word: '{word[0]}' occurs {word[1]} times")
def count_10_word_in_xml():
    from collections import Counter
    tree = ET.parse("newsafr.xml")
    descriptions = []
    root = tree.getroot()
    xml_items = root.findall("channel/item")
    for item in xml_items:
        description = item.find("description")
        descriptions += description.text.split()
    format_description = []
    for word in descriptions:
        if len(word) > 6:
            format_description.append(word.lower())

    def sortByLength(inputStr):
        return len(inputStr)

    format_description.sort(key=sortByLength, reverse=True)
    word_counts = Counter(format_description)
    words = word_counts.most_common(10)
    for word in words:
        print(f"Word: '{word[0]}' occurs {word[1]} times")
def _initialize_data(self) -> None:
    """Loads the dataset and prepares for it to be generated."""
    data = load_from_csv(self._filepath, self.name_col, self.lat_col,
                         self.lon_col, self.value_col)
    value_label = data.pop()
    self._names, self._lats, self._lons, self._values = data
    self._lat_max = max(self._lats) + self.border_offset + self.north_offset
    self._lat_min = min(self._lats) - self.border_offset - self.south_offset
    self._lon_max = max(self._lons) + self.border_offset + self.east_offset
    self._lon_min = min(self._lons) - self.border_offset - self.west_offset
    # Assign a number to each unique value provided, and map it to the points.
    count = IterCounter(self._values)
    # value[0] because each value from enumerate(count.most_common()) is a
    # tuple of the item's name and how many of that item are in the count.
    if self._mode == MODES[0]:
        self._legend = {
            value[0]: i + 1 if i + 1 <= len(COLOURS) else len(COLOURS)
            for i, value in enumerate(count.most_common())
        }
        self._verboseprint(self._legend)
    elif self._mode == MODES[1]:
        self._legend = {value_label: 1}
        self._values = [float(v) for v in self._values]
def Bartel(file_list):
    for infile in file_list:
        output = infile.split('.')
        leader_output = output[0] + 'leader.txt'
        tenmer_output = output[0] + 'tenmer.txt'
        leaders_list = []
        tenmers_list = []
        # First pass: locate the ORIGINAL_SEQUENCE column.
        with open(infile, 'r') as inF:
            for line in inF:
                if 'ORIGINAL_SEQUENCE' in line:
                    line_a = line.split('\t')
                    place = line_a.index('ORIGINAL_SEQUENCE')
        # Second pass: collect leaders and their first ten bases.
        with open(infile, 'r') as inF:
            for line in inF:
                leader = line.split('\t')[place]
                leaders_list.append(leader)
                if len(leader) >= 10:
                    tenmers_list.append(leader[:10])
        with open(leader_output, 'w+') as o:
            o.write(str(len(leaders_list)) + '\n')
            for leader_seq in leaders_list:
                o.write(leader_seq + '\n')
        with open(tenmer_output, 'w+') as o1:
            o1.write(str(len(tenmers_list)) + '\n')
            counted = Counter(tenmers_list)
            for tenmer, n in counted.most_common():
                o1.write(tenmer + '\t' + str(n) + '\n')
def wanabeknn(k=15):
    from collections import Counter
    ftrd = open("minidata/trainingData.csv")
    fted = open("minidata/testData.csv")
    flab = open("minidata/trainingLabels.csv")
    lab = [[int(j) for j in i.strip().split(",")] for i in flab.readlines()]
    trd = [[int(j) for j in i.strip().split("\t")] for i in ftrd.readlines()]
    ted = [[int(j) for j in i.strip().split("\t")] for i in fted.readlines()]

    def dist(a, b):
        return sum([min(a[i], b[i]) for i in xrange(len(a))])

    rez = []
    for v in ted:
        print "hurej %4d %3d" % (len(rez), len(rez[-1:]))
        t = []
        for trindex, train in enumerate(trd):
            t.append((dist(train, v), trindex))
        tt = sorted(t, reverse=True)
        ll = []
        for i in range(k):
            ll += lab[tt[i][1]]
        n = len(ll)
        for i in range(k / 3):
            ll += lab[tt[i][1]]
        rez.append([x[0] for x in Counter(ll).most_common(n / k)])
    print rez
    cPickle.dump(rez, file("rezPickled/wnbknn%d.pickled" % k, "w"), -1)
def _most_preferred(self, alternatives):
    """Applies funcnamei from each trait to the alternatives and returns
    the most preferred."""
    # `funcnamei` is assumed to be defined in the enclosing scope.
    prefs = [y for y in [getattr(x, funcnamei)(alternatives) for x in self.traits]
             if y is not None]
    if not prefs:
        return None
    if len(prefs) == 1:
        return prefs[0]
    return Counter(prefs).most_common(1)[0][0]
def items(self, relative=False):
    """Returns a list of (key, value)-tuples sorted by value, highest-first.
    With relative=True, the sum of values is 1.0.
    """
    a = Counter.most_common(self)
    if relative:
        n = sum(v for k, v in a) or 1.
        # most_common yields (key, value) pairs, so unpack in that order.
        a = [(k, v / n) for k, v in a]
    return a
def reorganizeString(self, S):
    """
    :type S: str
    :rtype: str

    "AAAB": (3 - 1) * 2 + 1 = 5 > 4, doomed.
    If ((most count - 1) * (N + 1)) + (number of chars at the most count) <= len(S), OK.
    If ((most count - 1) * (N + 1)) + (number of chars at the most count) > len(S), doomed.
    """
    from collections import Counter as CC
    c = CC(S)
    most_counted_char = max(c.values())
    total_cnt = (most_counted_char - 1) * 2 + \
        list(c.values()).count(most_counted_char)
    if total_cnt > len(S):
        return ""
    # n lists of chars; pop one by one from most to least common.
    ll = []
    for i in c.most_common():
        ll.append([i[0]] * i[1])
    # e.g. [[2,2,2],[3,3],[1]] -> [2,2] [3] [2] [3] -> result: 2 3 2 3 2 1
    result = ""
    # v_v_v_
    # vlv_v_
    # vlvov
    while True:
        flip = 0
        for l in ll:
            if l:
                if flip < 2:
                    if result and result[-1] == l[-1]:
                        continue
                    result += l.pop()
                    flip += 1
                else:
                    break
        if flip == 0:
            break
    return result
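# A worked check of the feasibility bound in the docstring above, with the
# adjacency gap N = 1 (so the factor is N + 1 = 2); toy strings only:
from collections import Counter

def feasible(s):
    c = Counter(s)
    most = max(c.values())
    ties = list(c.values()).count(most)
    return (most - 1) * 2 + ties <= len(s)

print(feasible("AAAB"))  # False: (3 - 1) * 2 + 1 = 5 > 4
print(feasible("AABB"))  # True:  (2 - 1) * 2 + 2 = 4 <= 4, e.g. "ABAB"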
def mostCommonWord(self, paragraph: str, banned: List[str]) -> str:
    count = Counter(re.sub(r'[^\w]', ' ', paragraph.lower()).split())
    ban_set = set(banned)
    # return next(k for k, _ in count.most_common() if k not in ban_set)
    while True:
        most = count.most_common(1)[0][0]
        if most in ban_set:
            count.pop(most)
        else:
            return most
def removeLeastCommonData(oData, oLabels, least=5):
    data = oData[:]
    labels = oLabels[:]
    c = Counter(chain(*labels))
    lc = c.most_common()
    bb = sorted(list(set([j for i, j in lc])))
    # Drop labels whose count is below the `least`-th smallest distinct count
    # (the original hardcoded index 5, matching the default argument).
    a = [x[0] for x in lc if x[1] < bb[least]]
    rem = [i for i, j in enumerate(labels) if len(set(j).intersection(set(a))) > 0]
    [labels.pop(x) for x in sorted(rem, reverse=True)]
    [data.pop(x) for x in sorted(rem, reverse=True)]
    return (data, labels)
def Repeat2(input):
    words = input.lower().replace(',', ' ').replace('.', ' ').replace('?', ' ').split()
    # Use a name that shadows neither the dict builtin nor the Counter class.
    counts = Counter(words)
    print(counts)
    print('most common', counts.most_common(1)[0])
    # Note: repeated words are printed once per occurrence.
    for key in words:
        if counts[key] > 1:
            print(key, counts[key])
    return
def main():
    global string_array
    global char_array
    global words
    global final_list
    global word_counter  # renamed from `Counter` to avoid shadowing the class
    global most_occur
    with open(args.filepath) as f:
        d = json.load(f)
    # Separate the JSON file's info into more usable, smaller parts.
    profile_pics = d["profile_pictures"]
    contacts = d["contacts"]
    c_list = contacts["list"]
    chats = d["chats"]
    chat_list = chats["list"]
    # Get all messages from `name`.
    for item in chat_list:
        if len(item) == 4:
            if item["name"] == args.name:
                for i in range(len(item["messages"])):
                    string_array = np.append(string_array, item["messages"][i]["text"])
    # Split and append to the word list.
    for item in string_array:
        if isinstance(item, str):
            split_it = item.split()
            words.append(split_it)
    # Add every word to the list.
    for item in words:
        final_list = final_list + item
    # Calculate the most common words.
    go = [x.upper() for x in final_list]
    word_counter = Counter(go)
    most_occur = word_counter.most_common(10)  # this number can be modified
    print(most_occur)
def getList(self, data):
    from collections import Counter
    n = 20000
    split_it = data.split()
    word_counts = Counter(split_it)
    most_occur = word_counts.most_common(n)
    vocalList = []
    # Iterate over what most_common actually returned, which may be fewer
    # than n entries.
    for i in range(len(most_occur)):
        vocalList.append(most_occur[i][0])
    return vocalList
def most_common_viz(output_dir: str, ints: collections.Counter) -> None:
    df = pd.DataFrame(ints.most_common(), columns=["Integer", "Frequency"])
    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        fh.write('<html><body>\n')
        fh.write('<h3>Most common integers:</h3>\n')
        fh.write(df.to_html(index=False))
        fh.write('</body></html>')
    with open(os.path.join(output_dir, 'index.tsv'), 'w') as fh:
        fh.write(df.to_csv(sep='\t', index=False))
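# A minimal invocation sketch for most_common_viz above, assuming the
# function and its pandas/os imports are in scope; the counts are toy data:
import collections
import tempfile

ints = collections.Counter([1, 1, 2, 3, 3, 3])
outdir = tempfile.mkdtemp()
most_common_viz(outdir, ints)  # writes index.html and index.tsv into outdir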
def count_code_samples(url):
    d = {}
    req = rq.get(url)
    source_code = req.text
    pattern = r'<code>(.*?)</code>'
    results = re.findall(pattern, source_code)
    for item in results:
        if item not in d:
            d[item] = 0
        d[item] += 1
    return Counter(d).most_common()
def locate_leaders(filename):
    donor = re.split('%20|%3a|%29', filename)[8]
    sample = re.split('%20|%3a|%29', filename)[10]
    output = (output_path + '%s_%s_locations.tsv') % (sample, donor)
    # query_list
    query_file = ('fishers_output_%s_%s_filteredfdr.txt') % (sample, donor)
    query_file = query_file.replace('onor', '')
    print query_file
    print output
    if query_file in unfinished:
        query_file = output_path + query_file
        query_list = []
        with open(query_file, 'r') as inF:
            next(inF)
            for line in inF:
                query = line.split('\t')[0]
                query_list.append(query)
        filename = path + filename
        bamfiles_list = []
        bamfiles_list2 = []
        with open(filename, 'r') as inF:
            for line in inF:
                if 'VHE' in line:
                    line = line.split('\t')
                    if line[1] == '0':
                        bamfiles_list.append(line[9])
                        bamfiles_list2.append([line[9], line[2], str(line[3]) + ',+'])
                    elif line[1] == '16':
                        bamfiles_list.append(line[9])
                        bamfiles_list2.append([line[9], line[2], str(line[3]) + ',-'])
                    else:
                        pass
        o = open(output, 'ab+')
        for i in range(0, len(query_list)):
            value = list(map(lambda x: str(x).startswith(query_list[i]), bamfiles_list))
            indexes = [y for y, j in zip(count(), value) if j]
            location = list(map(lambda x: str(bamfiles_list2[x][1]) + ' ' +
                                str(bamfiles_list2[x][2]), indexes))
            total = str(len(location))
            o.write(str(query_list[i]) + '\t' + total)
            location = Counter(location).most_common()
            for l in location:
                o.write('\t' + str(l[0]) + ':' + str(l[1]))
            o.write('\n')
        o.close()
    return output
def getKeyWords(Counter, num, country):
    # Note: `Counter` here is the collections.Counter class passed in by the
    # caller; the instance below is kept under a separate name so the class
    # is not shadowed.
    # Gets the news data from the news api website
    news_url = ('https://newsapi.org/v2/top-headlines?'
                'country={}&'
                'apiKey=a8a1a5ea66c04f1488210e7b0016b948'.format(country.lower()))
    response = requests.get(news_url)
    # List of stop words that are to be removed from our list of words
    stopwords = ["I", "Me", "My", "Myself", "We", "Our", "Ours", "Ourselves",
                 "You", "Your", "Yours", "Yourself", "Yourselves", "He", "Him",
                 "His", "Himself", "She", "Her", "hers", "herself", "it", "its",
                 "itself", "they", "them", "their", "theirs", "themselves",
                 "what", "which", "who", "whom", "this", "that", "these",
                 "those", "am", "is", "are", "was", "were", "be", "been",
                 "being", "have", "has", "had", "having", "do", "does", "did",
                 "doing", "a", "an", "the", "and", "but", "if", "or", "because",
                 "as", "until", "while", "of", "at", "by", "for", "with",
                 "about", "against", "between", "into", "through", "during",
                 "before", "after", "above", "below", "to", "from", "up",
                 "down", "in", "out", "on", "off", "over", "under", "again",
                 "further", "then", "once", "here", "there", "when", "where",
                 "why", "how", "all", "any", "both", "each", "few", "more",
                 "most", "other", "some", "such", "no", "nor", "not", "only",
                 "own", "same", "so", "than", "too", "very", "s", "t", "can",
                 "will", "just", "don", "should", "now", "-", "News", "The",
                 "|", "CNN", "CBS", "BBC", "Guardian", "says", "news", "Daily",
                 "Mail", "Online", "don't", "Mirror", "After", "NPR",
                 "Washington", "Post", "", "new", "Sky", "ITV", "could",
                 "suggests", "fears", "live", "say"]
    # Variable to store the list of headlines
    words = ''
    # Store the headlines in the declared variable
    for i in response.json()['articles']:
        words += i['title'] + ' '
    # Split the headlines into an array where each word is its own value
    words2 = words.split()
    # Remove special characters
    words2 = [re.sub('[^a-zA-Z]+', '', x) for x in words2]
    # Count the occurrence of each word
    counts = Counter(words2)
    # Remove the words in the stop word array from the counter
    # (Counter.__delitem__ silently ignores missing keys)
    for i in stopwords:
        del counts[i[:1].upper() + i[1:]]
        del counts[i[:1].lower() + i[1:]]
        del counts[i.upper()]
        del counts[i.lower()]
    # Return the list of most common words
    return counts.most_common(num)
def count_frequent_words():
    global word_counter  # renamed from `Counter` to avoid shadowing the class
    df = pd.read_csv('../../csv_files/scmp_article_content_all.csv')
    text_dataset = df['paragraphs']
    # print(type(text_dataset.head()))
    split_it = text_dataset.str.split()
    # print(split_it)
    word_counter = Counter(x for xs in split_it for x in xs)
    most_occur = word_counter.most_common(10)
    print(most_occur)
    return most_occur
def output_most_common(directory):
    files = glob.glob(os.path.join(directory, "*.txt"))
    for file in files:
        words = strip_stop_words(file).split()
        # with open(file, 'w') as csv_file:
        #     writer = csv.writer(csv_file)
        #     writer.writerow(['Word', 'Frequency'])
        #     for key, value in hold_diffs.items():
        #         writer.writerow([key, value])
        # return file
        # The original `print(Counter=Counter.most_common(50))` passed an
        # invalid keyword argument to print(); count the stripped words instead.
        print(Counter(words).most_common(50))
def removeMostCommonData(oData, oLabels, count=20):
    data = oData[:]
    labels = oLabels[:]
    for iafsa in range(count):
        c = Counter(chain(*labels))
        lc = c.most_common()
        dlc = {}
        for l in lc:
            dlc[l[0]] = l[1]
        # teze ("weights"): score each example by the global count of its
        # most frequent label, then drop the ten highest-scoring examples.
        teze = [max([dlc[y] for y in x]) for x in labels]
        teze = sorted([(y, x) for x, y in enumerate(teze)])
        rem = [x[1] for x in teze[-10:]]
        [labels.pop(x) for x in sorted(rem, reverse=True)]
        [data.pop(x) for x in sorted(rem, reverse=True)]
    return (data, labels)
def write_token_vocab(words: collections.Counter, save_path: Text) -> None:
    """Writes token vocabulary from @words to @save_path."""
    # Sort tokens by frequency, breaking ties reverse-lexicographically.
    words_with_counts = words.most_common()
    words_with_counts.sort(key=lambda x: (x[1], x[0]), reverse=True)
    vocab_path = os.path.join(save_path, 'vocab.cfq.tokens')
    with open(vocab_path, 'w') as f:
        # Tensor2tensor needs these additional tokens.
        f.write('<pad>\n<EOS>\n<OOV>\n')
        for word, _ in words_with_counts:
            f.write(f'{word}\n')
    logging.info(f'Token vocabulary written to {vocab_path} ({len(words)} '
                 'distinct tokens).')
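# A small demonstration of the tie-breaking sort used above: with
# reverse=True, ties in count are broken reverse-lexicographically
# (toy counts):
from collections import Counter

words = Counter({'b': 2, 'a': 2, 'c': 5})
words_with_counts = words.most_common()
words_with_counts.sort(key=lambda x: (x[1], x[0]), reverse=True)
print(words_with_counts)  # [('c', 5), ('b', 2), ('a', 2)]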
def Counter1(filename):
    with open(filename, 'r') as file:
        datastring = file.read().replace('\n', '')
    # noun_phrases returns the noun phrases TextBlob finds in the text.
    blob = TextBlob(datastring)
    qaz = blob.noun_phrases
    decriptors_series = []
    for i in range(0, len(qaz)):
        decriptors_series.append(qaz[i])
    phrase_counts = Counter(decriptors_series)
    most_occur = phrase_counts.most_common(10)
    return most_occur
def comcap(instaid):
    idlink = 'https://www.instagram.com/{}/?__a=1'
    url = idlink.format(instaid)
    data = requests.get(url).json()
    if data['graphql']['user']['is_private']:
        return 'oops its a private account'
    user = data['graphql']['user']['edge_owner_to_timeline_media']['edges']
    num = len(user)
    # print(num)
    allcap = ''
    for i in range(0, num):
        try:
            x = user[i]['node']['edge_media_to_caption']['edges'][0]['node']['text']
            m = re.findall(r'[#]\w+', x)
            t = ''
            for j in m:
                t = t + j + ' '
            t = re.sub(r'\#', ' ', t)
            t = t.lower()
            allcap = allcap + t + ' '
        except:  # skip posts without captions
            continue

    from collections import Counter
    data_set = allcap
    # split() returns a list of all the words in the string.
    split_it = data_set.split()
    # Pass the split_it list to an instance of the Counter class.
    hashtag_counts = Counter(split_it)
    # print(hashtag_counts)
    # most_common() produces the k most frequently encountered
    # input values and their respective counts.
    most_occur = hashtag_counts.most_common(8)
    txt = ''
    for i in most_occur:
        txt = txt + '#' + i[0] + ' '
    # print(txt)
    return txt
def MostFrequent():
    from collections import Counter
    data_set = "Welcome to the world of Geeks " \
               "This portal has been created to provide well written well " \
               "thought and well explained solutions for selected questions " \
               "If you like Geeks for Geeks and would like to contribute " \
               "here is your chance You can write article and mail your article " \
               "to contribute at geeksforgeeks org See your article appearing on " \
               "the Geeks for Geeks main page and help thousands of other Geeks. "
    split_it = data_set.split()
    word_counts = Counter(split_it)
    most_occur = word_counts.most_common(9)
    print(most_occur)
def most_common(self, n, conts):
    """Returns the n most frequent words in conts."""
    # Assumes conts is an iterable of words; the original ignored n and
    # called most_common on the class with conts as `self`.
    return Counter(conts).most_common(n)
from collections import Counter

text = "In February 2014, I made a recommendation to my co-founders at " \
       "Ballistiq that I wanted to cancel development of ArtStation. " \
       "The project was in development hell. It wasn’t going anywhere. " \
       "I was unhappy with it and just couldn’t see a path for it to be a " \
       "successful product. Two months later we managed to launch it, " \
       "and two years later it is the leading network for professional games."

words = text.split()
word_counts = Counter(words)
top_three = word_counts.most_common(3)
print(top_three)
from collections import Counter

hairy = open("RFand1R-2-2")
majcn = open("resultRF-0.72388.csv")
smotko = open("smotkoTopScore.csv")
zidar = open("result-0.36027.csv")
aaa = open("resultRF-0.85714.csv")

a = []
a.append([[int(j) for j in i.strip().split(" ")] for i in majcn.readlines()])
a.append([[int(j) for j in i.strip().split(" ")] for i in zidar.readlines()])
a.append([[int(j) for j in i.strip().split(" ")] for i in smotko.readlines()])
a.append([[int(j) for j in i.strip().split(" ")] for i in hairy.readlines()])
a.append([[int(j) for j in i.strip().split(" ")] for i in aaa.readlines()])

b = []
for i in xrange(len(a[0])):
    c = Counter(a[0][i] + a[1][i] + a[2][i] + a[3][i] + a[4][i])
    # Keep labels predicted by at least two of the five inputs; the original
    # appended before the fallback check, leaving `x` undefined.
    x = [p[0] for p in c.most_common() if p[1] > 1]
    if len(x) == 0:
        x = [40, 44, 18, 62, 41]
        print "aa"
    b.append(x)

for i in b:
    if len(i) == 1:
        print "aaa"
print "bb"

f = file("RFand1R-2-2", "w")
f.write("\n".join([" ".join([str(x) for x in i]) for i in b]))
f.flush()
f.close()
from collections import Counter

files = [open("knn.txt"), open("knnm.txt"), open("RF200por.txt"), open("test.txt")]
a = []
for f in files:
    a.append([[int(j) for j in i.strip().split(" ")] for i in f.readlines()])

b = []
for i in xrange(len(a[0])):
    c = Counter(a[0][i] + a[1][i] + a[2][i])
    x = [p[0] for p in c.most_common() if p[1] > 2]
    if len(x) == 0:
        x = [40]
    b.append(x)
print "done"

f = file("skupniNoTest.csv", "w")
f.write("\n".join([" ".join([str(x) for x in i]) for i in b]))
f.close()
# print Counter(foundEmosAndSmilies)
# print("found smilies top50")
# print Counter(foundEmosAndSmilies).most_common(50)
# print("total number of sentences with smilies:")
# print numOfTotalSentences

"""
Classify
"""
for message in messages:
    messagecounter += 1  # for every sentence
    score, words = emoCount.score(message)
    foundEmosAndSmilies = foundEmosAndSmilies + words
    if len(words) != 0:
        numOfTotalSentences += 1
        print(words)

# In the end
print("Messages total:")
print messagecounter
print("found smilies total:")
print sum(Counter(foundEmosAndSmilies).values())
print("found smilies examples ordered:")
print Counter(foundEmosAndSmilies)
print("found smilies top50")
print Counter(foundEmosAndSmilies).most_common(50)
print("total number of sentences with smilies:")
print numOfTotalSentences