def __init__(self, email_address):
    self.email_address = email_address
    self.ins = counter.Counter()
    # For the out nodes a count makes PageRank faster.
    self.outs = counter.Counter()
    self.outs_count = 0
    self.outs_ids = set()
def run_tests(sample, signal):
    # counter with default hyperparameter values
    default_counter = counter.Counter(sample)
    # signal created from sample, copied multiple times and distorted
    df_s = tests.generate_tests_from_sample(sample)
    # signal, original and distorted multiple times
    df = tests.generate_tests(signal)

    def show_result(name, counter, signal, do_print=False, plot=True):
        count_result = counter.count(signal)
        format_str = "{}:\nunified_count: {}, sub_counts:{}\nnn_count:{}, nn_sub_count:{}\n"
        to_print = format_str.format(name,
                                     count_result['unified_count'],
                                     count_result['max_counts'],
                                     count_result['unified_marked_count'],
                                     count_result['max_marked_counts'])
        print(to_print)
        if plot:
            plotting.plot_extrema_df_marked(signal, name, count_result)

    print("REAL DATA")
    for name in df:
        show_result(name, default_counter, df[name], True, True)

    print("SYNTHETIC DATA")
    for name in df_s:
        show_result(name, default_counter, df_s[name], True, True)
def main():
    sorts = [
        Sort_Interface("Bubble Sort", bubble_sort.BubbleSort),
        Sort_Interface("Shaker Sort", shaker_sort.ShakerSort),
        Sort_Interface("Selection Sort", selection_sort.SelectionSort),
        Sort_Interface("Quick Sort", qs.quick_sort_starter),
        Sort_Interface("Modified Quick Sort", mqs.modified_quick_sort_starter),
        Sort_Interface("Counting Sort", counting_sort.CountingSort),
    ]
    for s in range(3, 13):
        size = 2**s
        print(s)
        for i in range(len(sorts)):
            r = random_data.CreateRandomData(size)
            c = counter.Counter()
            sorts[i].method(r, c)
            compares = swaps = 0
            if c.Compares > 0:
                compares = math.log(c.Compares, 2)
            if c.Swaps > 0:
                swaps = math.log(c.Swaps, 2)
            print(" // ---------- %s - size = %d ---------- " % (sorts[i].name, size))
            print("compares = %d \n swaps = %d " % (compares, swaps))
def __init__(self, file_id, text, use_zlib, bit_size):
    self.id = file_id
    self.tokens = text.split()
    token_counter = counter.Counter(self.tokens)
    self.exact_fingerprint = self._adler_32(text, use_zlib)
    self.near_fingerprint = simhash.hash(token_counter, bit_size)
    self.near_fingerprint_buckets = []
    self.plateau_fingerprint = None
    self.plateau_fingerprint_buckets = []
    plateau = finn.find_plateau(self.tokens)
    if plateau is not None:
        plateau_counter = counter.Counter(plateau)
        self.plateau_fingerprint = simhash.hash(plateau_counter, bit_size)
def __init__(self, legalLabels, max_iterations):
    self.legalLabels = legalLabels
    self.type = "perceptron"
    self.max_iterations = max_iterations
    self.weights = {}
    for label in legalLabels:
        self.weights[label] = counter.Counter()  # this is the data-structure you should use
def finn_detection(speech_set, use_groups, similarity_distance):
    """Performs a finn duplicate detection.

    The finn duplicate detection operates across plateaus in the texts,
    comparing a near-fingerprint for each to other documents' plateaus. The
    similarity_distance variable sets the distance at which matches are
    accepted. If use_groups is set, it is the maximum number of non-matching
    groups for two plateaus to be found equal. Otherwise, it is the maximum
    Hamming distance between two plateaus for them to be found equal."""
    overlapping_speeches = set()
    if use_groups:
        # To do an L-groups-of-k-bits overlap detection, iterate over every
        # speech and look for other speeches that are in a large number of the
        # same buckets. O(nd), where d is the maximum number of plateaus in
        # any bucket of any group.
        for speech in speech_set.speeches:
            # Skip speeches without plateaus.
            if speech.plateau_fingerprint is None:
                continue
            number_buckets = len(speech.plateau_fingerprint_buckets)
            match_counter = counter.Counter()
            for bucket in speech.plateau_fingerprint_buckets:
                match_counter.update(bucket)
            for (match, count) in match_counter.most_common():
                # Don't want to match ourself!
                if speech == match:
                    continue
                if (number_buckets - count) > similarity_distance:
                    # No more matches will be close enough.
                    break
                # Order the pair to avoid duplicates.
                if speech.id < match.id:
                    overlapping_speeches.add((speech.id, match.id))
                else:
                    overlapping_speeches.add((match.id, speech.id))
    else:
        # Brute force search of the plateaus.
        pairs = itertools.combinations(speech_set.speeches, 2)
        for (a, b) in pairs:
            if a.plateau_fingerprint is None or b.plateau_fingerprint is None:
                continue
            distance = hamming_distance(a.plateau_fingerprint, b.plateau_fingerprint)
            if distance <= similarity_distance:
                overlapping_speeches.add((a.id, b.id))
    return overlapping_speeches
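
# The brute-force branch above relies on a hamming_distance() helper that is
# not shown in this snippet. A minimal sketch, assuming the fingerprints are
# plain integers of equal bit width (the usual SimHash representation):
def hamming_distance(fingerprint_a, fingerprint_b):
    """Number of bit positions in which the two fingerprints differ."""
    return bin(fingerprint_a ^ fingerprint_b).count("1")

# For example, hamming_distance(0b1011, 0b0011) == 1.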
def __init__(self, featurizer, classer, classes):
    """Create a new classifier.

    Args:
        featurizer: A function that is given a data sample and returns a list
            of features; the list must never be empty.
        classer: A function that returns the class of a data sample.
        classes: The possible classes the classifier will see. Since we might
            not see all classes in the data, this needs to be explicitly
            specified.
    """
    self.featurizer = featurizer
    self.classer = classer
    self.classes = classes
    self.class_counts = counter.Counter()
    self.feature_counts = collections.defaultdict(lambda: counter.Counter())
    self.class_to_feature_counts = counter.Counter()
def __init__(self): """ This starts the Tk framework up, instantiates the Model (a Counter object), instantiates the View (a MyFrame object), and starts the event loop that waits for the user to press a Button on the View. """ root = tkinter.Tk() self.model = counter.Counter() self.view = myFrame.MyFrame(self) self.view.mainloop() root.destroy()
def count(self):
    for file in self.files:
        try:
            file_type, line_count = counter.Counter(file).result
            try:
                self.line_counts[file_type] += line_count
            except KeyError:
                self.line_counts[file_type] = line_count
        except ValueError:
            # Not a text file
            continue
def __init__(self, logfile):
    self.logfile = logfile
    self.url = []
    self.parent_url = []
    self.is_seed = []
    self.hop = []
    self.crawl_date = []
    self.content_type = []
    self.anchor_text = []
    self.nline = {'total': 0, 'skipped': 0, 'parsed': 0}
    self.counters = counter.Counter()
    self.counters.newCounter('Accepted')
    self.counters.newCounter('Accepted_isSeed')
def best_tfidf(self):
    """Find the best word according to tf.idf.

    Not used for reasons explained in the report."""
    for (i, email) in enumerate(self.emails):
        print "\t%s" % i
        email.tfidf = counter.Counter()
        for word in email.words_counter:
            tf_d = email.words_counter[word]
            df = len(self.inverted_index[word])
            idf = math.log(self.number_emails / float(df))
            squasher = float(2 * email.length) / self.avg_length()
            score = (tf_d / (tf_d + squasher)) * idf
            email.tfidf[word] = score
    overall_tfidfs = counter.Counter()
    for email in self.emails:
        overall_tfidfs += email.tfidf
    return overall_tfidfs.most_common(1)[0][0]
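
# A small worked example of the score above: the term-frequency part
# tf_d / (tf_d + squasher) saturates toward 1 for frequent terms, while the
# squasher 2 * |d| / avg_length penalizes long documents. The numbers below
# are made up purely for illustration.
import math

tf_d = 3.0                 # term occurs 3 times in the email
df = 10                    # term appears in 10 of the emails
number_emails = 1000
doc_length, avg_length = 120.0, 100.0

idf = math.log(number_emails / df)          # ~4.61
squasher = 2 * doc_length / avg_length      # 2.4
score = (tf_d / (tf_d + squasher)) * idf    # ~2.56
print(score)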
class TestCounter(unittest.TestCase):
    """Tests the Counter class methods.

    Requires a redis running on localhost:6379
    """

    # Initiate a Counter instance with a connection to a local redis
    c = counter.Counter('localhost', 6379, 0, 'count', os.getenv("REDIS_PASS"))

    def test_redis_connection(self):
        """Tests if c can connect to the redis"""
        resp = self.c.r.execute_command('ping')
        self.assertEqual(resp, b'PONG')

    def test_get(self):
        """Tests Counter.get()"""
        # Test if the method returns 0 if the value does not exist
        self.c.r.delete('count')
        self.assertEqual(0, self.c.get(),
                         "get() did not return 0 on a non-existent key")
        # Test if the method returns the value set in redis
        self.c.r.set('count', 5)
        self.assertEqual(5, self.c.get(),
                         "get() did not return the correct value")

    def test_incr(self):
        """Tests Counter.incr()"""
        # Clear the redis key
        self.c.r.delete('count')
        # Test that a non-existent key is initialized
        self.c.incr()
        self.assertEqual(b'1', self.c.r.get('count'),
                         "incr() did not initialize the correct value")
        # Test that the value is increased by amount
        amount = 5
        ival = int(self.c.r.get('count'))
        rval = self.c.incr(amount)
        cval = int(self.c.r.get('count'))
        self.assertEqual(amount, cval - ival,
                         "incr() did not increase the value correctly")
        self.assertEqual(cval, rval,
                         "incr() did not return the resulting value")
def classify(self, data):
    """
    Classifies each datum as the label that most closely matches the prototype
    vector for that label. See the project description for details.

    Recall that a datum is a counter.Counter...
    """
    guesses = []
    for datum in data:
        vectors = counter.Counter()
        for l in self.legalLabels:
            vectors[l] = self.weights[l] * datum
        guesses.append(vectors.argMax())
    return guesses
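
# The snippet above assumes the project's counter.Counter overloads * as a dot
# product over shared keys and provides argMax() for the best-scoring key. A
# minimal sketch of that scoring step with plain dicts, purely for illustration:
def score(weights, datum):
    """Dot product of a weight vector and a feature vector (both dicts)."""
    return sum(weights.get(feature, 0) * value for feature, value in datum.items())

def classify_one(weights_by_label, datum):
    """Return the label whose weight vector scores the datum highest."""
    return max(weights_by_label, key=lambda label: score(weights_by_label[label], datum))

# Example: two labels, two features.
weights_by_label = {0: {"x": 1.0, "y": -1.0}, 1: {"x": -0.5, "y": 2.0}}
print(classify_one(weights_by_label, {"x": 1, "y": 1}))  # -> 1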
def _calculate_overlap(query, inverted_index):
    """Calculate the overlaps between a query and all documents.

    An inverted index is used so that only the documents containing the query
    words are examined."""
    # Counts the number of each document in the inverted index for each
    # word in the query. That is, if the query was "bob marley" and we
    # had {"bob" -> [d1, d4, d6], "marley" -> [d2, d4, d5]}, the result
    # would be a counter of {d1:1, d2:1, d4:2, d5:1, d6:1}.
    document_overlaps = counter.Counter()
    for word in query.words_counter:
        document_overlaps.update(inverted_index[word])
    return [(query, d, score) for (d, score) in document_overlaps.most_common()]
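
# The comment's example, reproduced with the standard library's
# collections.Counter (which has the update()/most_common() behavior assumed
# of counter.Counter here):
import collections

inverted_index = {"bob": ["d1", "d4", "d6"], "marley": ["d2", "d4", "d5"]}
document_overlaps = collections.Counter()
for word in ["bob", "marley"]:
    document_overlaps.update(inverted_index[word])
print(document_overlaps.most_common())
# -> [('d4', 2), ('d1', 1), ('d6', 1), ('d2', 1), ('d5', 1)]  (ties in insertion order)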
def basicFeatureExtractorDigit(datum):
    """
    Returns a set of pixel features indicating whether each pixel in the
    provided datum is white (0) or gray/black (1).
    """
    a = datum.getPixels()
    features = counter.Counter()
    for x in range(DIGIT_DATUM_WIDTH):
        for y in range(DIGIT_DATUM_HEIGHT):
            if datum.getPixel(x, y) > 0:
                features[(x, y)] = 1
            else:
                features[(x, y)] = 0
    return features
def basicFeatureExtractorFace(datum):
    """
    Returns a set of pixel features indicating whether each pixel in the
    provided datum is an edge (1) or no edge (0).
    """
    a = datum.getPixels()
    features = counter.Counter()
    for x in range(FACE_DATUM_WIDTH):
        for y in range(FACE_DATUM_HEIGHT):
            if datum.getPixel(x, y) > 0:
                features[(x, y)] = 1
            else:
                features[(x, y)] = 0
    return features
def main(confc):
    logger = logging.getLogger("main")
    #conf = Config_mascrawlloggen()

    # create counters
    counters = counter.Counter()

    # create output file
    fcsv = open(confc["outputfile"], 'w')
    csvwriter = csv.writer(fcsv, quoting=csv.QUOTE_ALL)
    header = ("id", "fullpath", "url", "parenturl", "datetime", "hop", "content_type")
    csvwriter.writerow(header)

    # work through the download directory for all .txt files
    pdfi = 0
    logger.info("traversing data directory")
    for root, dirs, files in os.walk(confc["datadir"]):
        for f in files:
            if f.endswith('.txt'):
                url_fetched = get_url_fetched(os.path.join(root, f))
                if not url_fetched:
                    continue
                pdfpath = os.path.join(root, os.path.splitext(f)[0] + '.pdf')
                print_prog(pdfi, len(files), step=1000, comments=pdfpath)
                if not os.path.exists(pdfpath):
                    logger.warning("pdf file not found: " + pdfpath)
                    continue
                counters.addCounter('all')
                pdfi += 1
                # first try to get the datetime from the file name;
                # if that fails, fall back to the file's modification time
                datetimestr = f.split("_")[0]
                try:
                    dt = datetime.strptime(datetimestr, "%Y-%m-%d-%H-%M-%S")
                except ValueError:
                    (f_mode, f_ino, f_dev, f_nlink, f_uid, f_gid, f_size,
                     f_atime, f_mtime, f_ctime) = os.stat(pdfpath)
                    f_mtime = time.ctime(f_mtime)
                    dt = datetime.strptime(f_mtime, "%a %b %d %H:%M:%S %Y")
                parenturl = ""
                hop = 0
                content_type = 'application/pdf'
                outline = (pdfi, pdfpath, url_fetched, parenturl, str(dt), hop, content_type)
                csvwriter.writerow(outline)
    fcsv.close()
    logger.info("file output to: " + fcsv.name)
def main():
    # The following causes the constructor to be invoked: __init__()
    score_counter = counter.Counter()    # Create a counter object
    print(score_counter.get_count())     # Display the current value of counter

    for points in range(10):             # Increment the counter 10 times
        score_counter.increment()
    print(score_counter.get_count())     # Display the current value of counter

    for points in range(5):              # Decrement the counter 5 times
        score_counter.decrement()
    print(score_counter.get_count())     # Display the current value of counter

    score_counter.reset()                # Reset the counter
    print(score_counter.get_count())     # Display the current value of counter

    # The following causes the 'to string' method to be invoked: __str__()
    print(score_counter)                 # Displays str representation of counter

    score_counter.set_count(100)         # Set counter to 100
    print(score_counter)                 # Displays str representation of counter
def main(): """Creates a graph.dot file with interesting information.""" print "Parsing wikipedia.txt" wiki_words = get_wikipedia_words("data/wikipedia.txt") # Parse the aliases.txt file. print "Parsing aliases.txt" (aliases, inverse_aliases) = get_aliases("data/aliases.txt") # Parse the roles.txt file. print "Parsing roles.txt" employees_info = info.get_employees_map("data/roles.txt", inverse_aliases) # Parse the graph.txt file to get the email graph. print "Parsing graph.txt" (_, email_graph) = graphs.process_file("data/graph.txt", inverse_aliases) interesting_nodes = get_interesting_nodes(email_graph, aliases) relations = get_relations(interesting_nodes, employees_info, wiki_words) # Write the resultant graph. print "Writing results." with open("graph.dot", "w") as f: f.write('digraph G {\n') print "%s relations" % len(relations) for (i, relation) in enumerate(sorted(relations)): print "%s" % i a = relation.from_info b = relation.to_info # Choose the most common word in the email subjects. c = counter.Counter() for email in relation.emails.emails: if email.subject is not None: c.update(email.subject) if len(c) > 0: best_word = c.most_common(1)[0][0] else: best_word = "" f.write('"%s" -> "%s" [label = "%s"];\n' % (a.description(), b.description(), best_word)) f.write('}\n')
def main():
    # Argument parsing
    args = parseArguments()

    # Counter class, holds all 4 counters
    counter = c.Counter()

    # Parse inputted weights for generator
    weights = [args.__dict__[chr(97 + x)] / 26 for x in range(0, 26)]

    # Initializing and generating text
    gen = gt.GenText()
    dataset = gen.generate(args.letters, weights)

    # All 4 approximate counters plus the exact counter
    ec = counter.ExactCount(dataset)
    mg = counter.MisraGries(dataset, args.kappa)
    mm = counter.MankuMotwani(dataset, args.kappa)
    m = counter.Metwally(dataset, args.kappa)
    cms = counter.Count_Min_Sketch(dataset, args.error, args.factor)

    smallest = 3
    print("Exact Count")
    print((sorted(ec[0][:smallest], reverse=True), ec[1], ec[2]))

    print("MisraGries Score")
    print(scoreCalc(ec, mg, smallest))
    print("Accuracy: " + str(accuracy(ec[0], mg[0], smallest)) + "%")
    print((mg[0][:smallest], mg[1], mg[2]))

    print("MankuMotwani Score")
    print(scoreCalc(ec, mm, smallest))
    print("Accuracy: " + str(accuracy(ec[0], mm[0], smallest)) + "%")
    print((mm[0][:smallest], mm[1], mm[2]))

    print("Metwally Score")
    print(scoreCalc(ec, m, smallest))
    print("Accuracy: " + str(accuracy(ec[0], m[0], smallest)) + "%")
    print((m[0][:smallest], m[1], m[2]))

    print("Count_Min_Sketch Score")
    print(scoreCalc(ec, cms, smallest))
    print("Accuracy: " + str(accuracy(ec[0], cms[0], smallest)) + "%")
    print((cms[0][:smallest], cms[1], cms[2]))
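
# For reference, a minimal sketch of the Misra-Gries idea exercised above:
# keep at most k-1 candidate counters; when a new, untracked item arrives and
# no slot is free, decrement every counter. Any item occurring more than n/k
# times is guaranteed to survive. This is an illustration only, not the
# project's counter.MisraGries implementation.
def misra_gries(stream, k):
    candidates = {}
    for item in stream:
        if item in candidates:
            candidates[item] += 1
        elif len(candidates) < k - 1:
            candidates[item] = 1
        else:
            # Decrement every counter; drop the ones that hit zero.
            for key in list(candidates):
                candidates[key] -= 1
                if candidates[key] == 0:
                    del candidates[key]
    return candidates

print(misra_gries("abababc", 3))  # -> {'a': 2, 'b': 2}: heavy hitters kept, counts underestimated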
def run(self):
    """Runs the main menu loop, dispatching on the user's choice."""
    account = counter.Counter()
    account.set_operations(self.get_data_from_file())
    i = 0
    while i != 9:
        i = self.user_input(lambda: input("1 - Show operations history \n"
                                          "2 - Add new operation \n"
                                          "3 - Get operations by money \n"
                                          "4 - Get operation by description \n"
                                          "5 - Get operation by date \n"
                                          "6 - Get balance \n"
                                          "7 - Clear operations history \n"
                                          "8 - Change file \n"
                                          "9 - Exit \n"))
        if i == 1:
            # Use a separate name so the menu variable i is not clobbered.
            for operation in account:
                print(operation)
        elif i == 2:
            account.add_operation(self.get_new_operation())
            self.save_data_into_file(account.get_operations())
        elif i == 3:
            for k in account.get_operations_by_money(
                    int(self.user_input(
                        lambda: input("Enter money value = ")))):
                print(k)
        elif i == 4:
            for k in account.get_operations_by_description(
                    input("Enter description value: ")):
                print(k)
        elif i == 5:
            for k in account.get_operations_by_date(self.get_datetime()):
                print(k)
        elif i == 6:
            print("balance = " + str(account.get_balance()))
        elif i == 7:
            account.delete_operations()
        elif i == 8:
            account.set_operations(self.get_data_from_file())
        elif i != 9:
            print("Wrong choice, try again")
def main():
    pc = ProgressChecker()
    c = counter.Counter()
    totalsize = 0
    bytype = dict()
    states = dict()
    processed_states = dict()
    for d in argv[1:]:
        todo = 0
        for ftmp in os.listdir(d):
            if string.find(ftmp, ".zip") >= 0:
                f = d + "/" + ftmp
                if pc.isFinished(f):
                    processed_states[os.path.basename(d)] = 1
                else:
                    nr, nrbytype = c.count(f)
                    todo += nr
                    k = os.path.basename(d)
                    if states.has_key(k):
                        states[k] += nr
                    else:
                        states[k] = nr
                    for k in nrbytype.keys():
                        if bytype.has_key(k):
                            bytype[k] += nrbytype[k]
                        else:
                            bytype[k] = nrbytype[k]
        if todo > 0:
            totalsize = totalsize + todo
            print str(todo) + ": " + d
    print "Have about " + str(totalsize) + " records to add."
    a = bytype.keys()
    a.sort()
    for k in a:
        print "Records of type " + k + ": " + str(bytype[k])
    print "Have not added any records for the following states:"
    a = states.keys()
    a.sort()
    for k in a:
        if not processed_states.has_key(k):
            print "\t" + k + "\t" + str(states[k])
def startup(verbal=False):
    # initial setups
    # "crawldir" contains all crawled documents including ".pdf" and ".txt" files
    crawldir = '/msa/crawl/directory/crawler-out'
    # "outputlogpath" is the output file path
    outputlogpath = "/msa/crawl/directory/crawl.log"

    # open output file
    fout = open(outputlogpath, 'w')

    # create counters
    counters = counter.Counter()
    counters.newCounter('txt')
    counters.newCounter('pdf')

    # walk through the crawl directory for all .txt files
    for root, dirs, files in os.walk(crawldir):
        for f in files:
            if f.endswith('.txt'):
                counters.addCounter('txt')
                # load the txt file
                lines = file(os.path.join(root, f)).readlines()
                for line in lines:
                    line = line.strip('\n')
                    if line.startswith("Fetched"):
                        url_fetched = line[9:]
                        break
                # write filename and URL into output file
                pdfpath = os.path.splitext(f)[0] + '.pdf'
                lineout = ' '.join([pdfpath, url_fetched]) + '\n'
                fout.write(lineout)
                if verbal:
                    print lineout
            elif f.endswith('.pdf'):
                counters.addCounter('pdf')
            else:
                print 'unknown file extension: ', f

    fout.close()
    counters.setCounter('all', counters.txt + counters.pdf)
    counters.printCounter()
    print 'file output to: ' + outputlogpath
def get_all_possibilities(word):
    print(word)
    letters = list(word)
    letters_in_TCHSET = map(lambda x: "ENG_" + x, letters)
    possibilities_in_TCHSET = map(lambda x: eng_to_readablechars[x] if eng_to_readablechars[x] else x,
                                  letters_in_TCHSET)
    possibilities_per_slot = map(lambda x: len(x) if x else 1, possibilities_in_TCHSET)
    if possibilities_per_slot == 0:
        return [word]
    possibility_counter = counter.Counter(len(word), possibilities_per_slot)
    generate_word = get_word_generator(possibilities_in_TCHSET)
    all_possibilities = []
    selection_number = possibility_counter.get_curr_value()
    all_possibilities.append(generate_word(selection_number))
    while possibility_counter.can_increment():
        possibility_counter.increment()
        selection_number = possibility_counter.get_curr_value()
        all_possibilities.append(generate_word(selection_number))
    all_possibilities = map(lambda x: "".join(x), all_possibilities)
    return all_possibilities
def calculateLogJointProbabilities(self, datum):
    """
    Returns the log-joint distribution over legal labels and the datum.
    Each log-probability should be stored in the log-joint counter, e.g.
    logJoint[3] = <Estimate of log( P(Label = 3, datum) )>

    To get the list of all possible features or labels, use self.features
    and self.legalLabels.
    """
    logJoint = counter.Counter()
    evidence = datum.items()
    "*** YOUR CODE HERE ***"
    for y in self.legalLabels:
        if self.Pr_distribution_of_features[y] != 0:
            logJoint[y] = math.log(self.Pr_distribution_of_features[y])
        else:
            logJoint[y] = 0
        for f in self.conditionals:
            prob = self.conditionals[f][datum[f]][y]
            logJoint[y] += (prob and math.log(prob) or 0.0)
    return logJoint
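
# A tiny numeric illustration of the log-joint computed above: log P(y) plus
# the sum of log P(f = value | y) over the features. The prior of 0.25 and the
# conditionals 0.8 and 0.1 below are made up for illustration:
import math

log_joint = math.log(0.25) + math.log(0.8) + math.log(0.1)
print(log_joint)  # ≈ -3.912, i.e. log(0.25 * 0.8 * 0.1) = log(0.02)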
def calculate_similarity(self, query_file, data_file, filename):
    """Calculate the similarity between a query file and a data file.

    The results are written to a file named "filename"."""
    queries_set = doc.DocumentSet(query_file)
    documents_set = doc.DocumentSet(data_file)
    results = []
    for query in queries_set.documents:
        # Compute the initial tfidfs.
        initial_tfidfs = self.tf_idf._tfidf(query, documents_set)

        # Select the top n_d scoring documents.
        initial_tfidfs = sorted([(-s, d) for (_, d, s) in initial_tfidfs])
        initial_tfidfs = [(d, -s) for (s, d) in initial_tfidfs[:self.n_d]]
        selected_docs = [document for (document, _) in initial_tfidfs]

        # Combine the top documents into a 'mega document'.
        summed_counter = counter.Counter(query.words_counter)
        for document in selected_docs:
            summed_counter += document.words_counter
        mega_document = doc.document_from_dict(None, dict(summed_counter))

        # Select the top n_w scoring words (via tf.idf) from the mega document.
        word_scores = []
        for word in sorted(list(mega_document.words_counter)):
            score = self.tf_idf._document_tfidf(word, mega_document, documents_set)
            word_scores.append((-score, word))
        word_scores = sorted(word_scores)[:self.n_w]
        word_scores = [(word, -score) for (score, word) in word_scores]

        # Use these new words as the next query, and return the tf.idf scores.
        new_query = doc.document_from_dict(query.id, dict(word_scores))
        results.extend(self.tf_idf._tfidf(new_query, documents_set))
    output.write_output_file(filename, results)
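
# The 'mega document' step above is a simple query-expansion (pseudo-relevance
# feedback) idea: merge the word counts of the query and its top-scoring
# documents, then re-rank with the best words of the merged counts. Illustrated
# with collections.Counter standing in for the project's counter.Counter:
import collections

query_counts = collections.Counter({"jaguar": 2})
top_doc_counts = [collections.Counter({"jaguar": 1, "cat": 3}),
                  collections.Counter({"jaguar": 1, "habitat": 2})]

summed = collections.Counter(query_counts)
for doc_counts in top_doc_counts:
    summed += doc_counts
print(summed.most_common(2))  # -> [('jaguar', 4), ('cat', 3)]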
import counter
# import joker
import test_joker

addons = [
    counter.Counter(),
    # joker.Joker(),
    test_joker.Joker()
]
def train(self, trainingData, trainingLabels, validationData, validationLabels):
    """
    Trains the classifier by collecting counts over the training data, and
    stores the Laplace smoothed estimates so that they can be used to
    classify. Evaluate each smoothing parameter in spgrid to choose the one
    that gives the best accuracy on the held-out validationData.

    trainingData and validationData are lists of feature Counters. The
    corresponding label lists contain the correct label for each datum.

    To get the list of all possible features or labels, use self.features
    and self.legalLabels.
    """
    # might be useful in your code later...
    # this is a list of all features in the training set.
    pre_fs = []
    for datum in trainingData:
        for key in datum.keys():
            pre_fs.append(key)
    feature_set = set(pre_fs)
    self.features = list(feature_set)
    #sys.exit(1)

    spgrid = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 20, 50]

    Pr_distribution_of_features = counter.Counter()
    for label in trainingLabels:
        Pr_distribution_of_features[label] += 1
    Pr_distribution_of_features.normalize()
    self.Pr_distribution_of_features = Pr_distribution_of_features

    # Initialize stuff
    binary_feature_counts = {}
    totals = {}
    for f in self.features:
        binary_feature_counts[f] = {0: counter.Counter(), 1: counter.Counter()}
        totals[f] = counter.Counter()

    # Calculate totals and binary feature counts.
    # enumerate(thing) returns an iterator that will return
    # (0, thing[0]), (1, thing[1]), (2, thing[2]), ...
    for i, datum in enumerate(trainingData):
        y = trainingLabels[i]
        for f, value in datum.items():
            binary_feature_counts[f][value][y] += 1.0
            totals[f][y] += 1.0

    bestConditionals = {}
    bestAccuracy = None
    # Evaluate each smoothing parameter, and use the one that yields the best accuracy
    for sp in spgrid or [0.0]:
        correct = 0
        conditionals = {}
        for f in self.features:
            conditionals[f] = {0: counter.Counter(), 1: counter.Counter()}

        # Run Laplace smoothing
        for f in self.features:
            for value in [0, 1]:
                for y in self.legalLabels:
                    conditionals[f][value][y] = (
                        binary_feature_counts[f][value][y] + sp) / (totals[f][y] + sp * 2)

        # Check the accuracy associated with this smoothing parameter
        self.conditionals = conditionals
        guesses = self.classify(validationData)
        for i, guess in enumerate(guesses):
            correct += (validationLabels[i] == guess and 1.0 or 0.0)
        accuracy = correct / len(guesses)

        # Keep the best smoothing parameter so far
        if bestAccuracy is None or accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestConditionals = conditionals
            self.sp = sp

    self.conditionals = bestConditionals
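
# A small worked example of the Laplace (add-sp) smoothing step above, for a
# single binary feature f and label y, with made-up counts:
count_f1_y = 3.0    # times feature f was 1 for label y
total_y = 10.0      # times label y was seen with feature f recorded
sp = 0.5            # smoothing parameter being evaluated

# P(f = 1 | y) with add-sp smoothing over the two possible feature values.
p_f1_given_y = (count_f1_y + sp) / (total_y + sp * 2)
print(p_f1_given_y)  # 3.5 / 11.0 ≈ 0.318; the unsmoothed estimate would be 0.3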
import os
import socketserver

import counter
import parse_datagram
import pretty_json

COUNTER = counter.Counter()
HOST = "0.0.0.0"
METRICS_PORT = int(os.environ.get("METRICS_PORT", 8125))


class Handler(socketserver.BaseRequestHandler):
    PREFIX_SUFFIX = METRICS_PORT

    def handle(self):
        count = COUNTER.increment()
        # There is no difference between request count / metric count as there
        # is for the trace server.
        prefix = f"{count:03d}-{count:03d}-{self.PREFIX_SUFFIX} | "
        data = self.request[0].strip()
        data_parsed = parse_datagram.parse_metric(data)
        try:
            colorful_json = pretty_json.printable_json(data_parsed, prefix)
            print(f"{prefix}metric =")
            print(colorful_json, end="")
        except Exception:
            # The original fallback is truncated here; a minimal guess
            # (assumption) is to print the parsed metric without colorizing.
            print(f"{prefix}metric = {data_parsed}")
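
# A quick way to exercise the handler above, assuming it is registered with a
# socketserver.UDPServer listening on METRICS_PORT and that parse_datagram
# accepts statsd-style payloads (both assumptions, not shown in the snippet):
import socket

sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.sendto(b"page.views:1|c", ("127.0.0.1", 8125))
sock.close()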
import counter
import coverage

counter.Counter.connect()
instance = counter.Counter()


def testGet():
    instance.setCount(65535)
    assert instance.getCount() == 65535


def testIncrement():
    instance.setCount(655)
    instance.increment()
    assert instance.getCount() == 656


def testSet():
    instance.setCount(3333333)
    assert instance.getCount() == 3333333


def testNegativeDecrement():
    instance.setCount(0)
    assert instance.decrement() == False


def testComplex():
    instance.setCount(65535)
    instance.increment()