Example #1
    def __init__(self, email_address):
        self.email_address = email_address
        self.ins = counter.Counter()
        # For the out nodes, a count makes PageRank faster.
        self.outs = counter.Counter()
        self.outs_count = 0
        self.outs_ids = set()
Example #2
def run_tests(sample, signal):

    # counter with default hyperparameters
    default_counter = counter.Counter(sample)

    # signal created from sample, copied multiple times and distorted
    df_s = tests.generate_tests_from_sample(sample)

    # signal, original and distorted multiple times
    df = tests.generate_tests(signal)

    def show_result(name, counter, signal, do_print=False, plot=True):   
        count_result = counter.count(signal)
        format_str = "{}:\nunified_count: {}, sub_counts:{}\nnn_count:{}, nn_sub_count:{}\n"
        to_print = format_str.format(name, count_result['unified_count'], count_result['max_counts'], count_result['unified_marked_count'], count_result['max_marked_counts'])       
        print(to_print)

        if plot:
            plotting.plot_extrema_df_marked(signal, name, count_result)

    print("REAL DATA")
    for name in df:
        show_result(name, default_counter, df[name], True, True)

    print("SYNTHETIC DATA")
    #
    for name in df_s:
        show_result(name, default_counter, df_s[name], True, True)
Example #3
def main():
    sorts = [
        Sort_Interface("Bubble Sort", bubble_sort.BubbleSort),
        Sort_Interface("Shaker Sort", shaker_sort.ShakerSort),
        Sort_Interface("Selection Sort", selection_sort.SelectionSort),
        Sort_Interface("Quick Sort", qs.quick_sort_starter),
        Sort_Interface("Modified Quick Sort", mqs.modified_quick_sort_starter),
        Sort_Interface("Counting Sort", counting_sort.CountingSort),
    ]

    for s in range(3, 13):
        size = 2**s
        print(s)

        for i in range(len(sorts)):
            r = random_data.CreateRandomData(size)
            c = counter.Counter()
            sorts[i].method(r, c)

            compares = swaps = 0
            if c.Compares > 0:
                compares = math.log(c.Compares, 2)

            if c.Swaps > 0:
                swaps = math.log(c.Swaps, 2)
            print(" // ---------- %s - size = %d ---------- " %
                  (sorts[i].name, size))
            print("compares = %d \n swaps = %d " % (compares, swaps))
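Example #3 uses a Sort_Interface wrapper (and random_data/counter helpers) that are not shown in the snippet. A minimal sketch of the wrapper, assuming it only needs to pair a display name with a sorting callable invoked as method(data, counter), could be:

import collections

# Hypothetical stand-in for the Sort_Interface wrapper used in Example #3.
Sort_Interface = collections.namedtuple("Sort_Interface", ["name", "method"])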
Example #4
    def __init__(self, file_id, text, use_zlib, bit_size):
        self.id = file_id

        self.tokens = text.split()
        token_counter = counter.Counter(self.tokens)

        self.exact_fingerprint = self._adler_32(text, use_zlib)

        self.near_fingerprint = simhash.hash(token_counter, bit_size)
        self.near_fingerprint_buckets = []

        self.plateau_fingerprint = None
        self.plateau_fingerprint_buckets = []
        plateau = finn.find_plateau(self.tokens)
        if plateau is not None:
            plateau_counter = counter.Counter(plateau)
            self.plateau_fingerprint = simhash.hash(plateau_counter, bit_size)
Example #5
    def __init__(self, legalLabels, max_iterations):
        self.legalLabels = legalLabels
        self.type = "perceptron"
        self.max_iterations = max_iterations
        self.weights = {}
        for label in legalLabels:
            # this is the data structure you should use
            self.weights[label] = counter.Counter()
Example #6
def finn_detection(speech_set, use_groups, similarity_distance):
    """Performs finn duplicate detection.

    Finn duplicate detection operates across plateaus in the texts, comparing
    a near-fingerprint for each plateau to the plateaus of other documents.

    The similarity_distance argument sets the distance at which matches are
    accepted. If use_groups is set, it is the maximum number of non-matching
    groups for two plateaus to be considered equal. Otherwise, it is the
    maximum Hamming distance between two plateaus for them to be considered
    equal."""

    overlapping_speeches = set()

    if use_groups:
        # To do an L-groups-of-k-bits overlap detection, iterate over every
        # speech and look for other speeches that share a large number of the
        # same buckets. O(nd), where d is the maximum number of plateaus in
        # any bucket of any group.

        for speech in speech_set.speeches:
            # Skip speeches without plateaus.
            if speech.plateau_fingerprint is None:
                continue

            number_buckets = len(speech.near_fingerprint_buckets)

            match_counter = counter.Counter()
            for bucket in speech.plateau_fingerprint_buckets:
                match_counter.update(bucket)

            for (match, count) in match_counter.most_common():
                # Don't want to match ourself!
                if speech == match:
                    continue

                if (number_buckets - count) > similarity_distance:
                    # No more matches will be close enough.
                    break

                # Order the pair to avoid duplicates.
                if speech.id < match.id:
                    overlapping_speeches.add((speech.id, match.id))
                else:
                    overlapping_speeches.add((match.id, speech.id))
    else:
        # Brute force search of the plateaus.
        pairs = itertools.combinations(speech_set.speeches, 2)
        for (a, b) in pairs:
            if a.plateau_fingerprint is None or b.plateau_fingerprint is None:
                continue

            distance = hamming_distance(a.plateau_fingerprint,
                                        b.plateau_fingerprint)
            if distance <= similarity_distance:
                overlapping_speeches.add((a.id, b.id))

    return overlapping_speeches
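The brute-force branch above calls a hamming_distance helper that is not part of the snippet. A minimal sketch, assuming the fingerprints are plain integers of equal bit width, could be:

def hamming_distance(a, b):
    # Number of bit positions in which the integer fingerprints a and b differ.
    return bin(a ^ b).count("1")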
Example #7
    def __init__(self, featurizer, classer, classes):
        """
        Create a new classifier.

        Args:
        featurizer: A function that takes a data sample and returns a list of
                    features; the returned list must never be empty.
        classer: A function that returns the class of a data sample.
        classes: The possible classes the classifier will see. Since we might
                 not see all classes in the data, this needs to be explicitly
                 specified.
        """
        self.featurizer = featurizer
        self.classer = classer
        self.classes = classes

        self.class_counts = counter.Counter()
        self.feature_counts = collections.defaultdict(lambda: counter.Counter())
        self.class_to_feature_counts = counter.Counter()
Example #8
    def __init__(self):
        """
        This starts the Tk framework, instantiates the Model (a Counter
        object), instantiates the View (a MyFrame object), and starts the
        event loop that waits for the user to press a Button on the View.
        """

        root = tkinter.Tk()
        self.model = counter.Counter()
        self.view = myFrame.MyFrame(self)
        self.view.mainloop()
        root.destroy()
Example #9
    def count(self):
        for file in self.files:
            try:
                file_type, line_count = counter.Counter(file).result
                try:
                    self.line_counts[file_type] += line_count
                except KeyError:
                    self.line_counts[file_type] = line_count
            except ValueError:
                # Not a text file
                continue
Example #10
    def __init__(self, logfile):
        self.logfile = logfile
        self.url = []
        self.parent_url = []
        self.is_seed = []
        self.hop = []
        self.crawl_date = []
        self.content_type = []
        self.anchor_text = []
        self.nline = {'total': 0, 'skipped': 0, 'parsed': 0}
        self.counters = counter.Counter()
        self.counters.newCounter('Accepted')
        self.counters.newCounter('Accepted_isSeed')
Example #11
  def best_tfidf(self):
    """Find the best word according to tf.idf.

    Not used for reasons explained in the report."""

    for (i, email) in enumerate(self.emails):
      print("\t%s" % i)
      email.tfidf = counter.Counter()
      for word in email.words_counter:
        tf_d = email.words_counter[word]
        df = len(self.inverted_index[word])
        idf = math.log(self.number_emails / float(df))
        squasher = float(2 * email.length) / self.avg_length()
        score = (tf_d / (tf_d + squasher)) * idf

        email.tfidf[word] = score

    overall_tfidfs = counter.Counter()
    for email in self.emails:
      overall_tfidfs += email.tfidf

    return overall_tfidfs.most_common(1)[0][0]
Example #12
class TestCounter(unittest.TestCase):
    """Tests the Counter class methods.

    Requires a Redis server running on localhost:6379.
    """

    # Initiate a Counter instance with connection to a local redis
    c = counter.Counter('localhost', 6379, 0, 'count', os.getenv("REDIS_PASS"))

    def test_redis_connection(self):
        """Tests if c can connect to the redis
        """

        resp = self.c.r.execute_command('ping')
        self.assertEqual(resp, b'PONG')

    def test_get(self):
        """Tests Counter.get()
        """

        # Test if method returns 0 if value does not exist
        self.c.r.delete('count')
        self.assertEqual(0, self.c.get(),
                         "get() did not return 0 for a non-existent key")

        # Test if method returns the value set in redis
        self.c.r.set('count', 5)
        self.assertEqual(5, self.c.get(),
                         "get() did not return the correct value")

    def test_incr(self):
        """Tests Counter.incr()
        """

        # clear the redis key
        self.c.r.delete('count')

        # test that non-existent key is initialized
        self.c.incr()
        self.assertEqual(b'1', self.c.r.get('count'),
                         "incr() did not initialize the key to the correct value")

        # test that the value is increased by the given amount
        amount = 5
        ival = int(self.c.r.get('count'))
        rval = self.c.incr(amount)
        cval = int(self.c.r.get('count'))
        self.assertEqual(amount, cval - ival,
                         "incr() did not increase the value correctly")
        self.assertEqual(cval, rval,
                         "incr() did not return the resulting value")
Example #13
    def classify(self, data):
        """
        Classifies each datum as the label that most closely matches the
        prototype vector for that label. See the project description for
        details.

        Recall that a datum is a counter.Counter...
        """
        guesses = []
        for datum in data:
            vectors = counter.Counter()
            for l in self.legalLabels:
                vectors[l] = self.weights[l] * datum
            guesses.append(vectors.argMax())
        return guesses
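The classify() method above leans on two conventions of the course-style counter.Counter (a dict subclass): multiplying two counters gives their dot product over shared keys, and argMax() returns the key with the highest value. A rough sketch of just those two operations, offered as an assumption rather than the original implementation:

class Counter(dict):
    def __mul__(self, other):
        # Dot product over the keys the two counters share.
        return sum(value * other[key] for key, value in self.items() if key in other)

    def argMax(self):
        # Key with the largest value, or None for an empty counter.
        if not self:
            return None
        return max(self.items(), key=lambda kv: kv[1])[0]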
Example #14
def _calculate_overlap(query, inverted_index):
  """Calculate the overlaps between a query and all documents.

  An inverted index is used so that only the documents containing
  the query words are examined."""

  # Counts the number of each document in the inverted index for each
  # word in the query. That is, if the query was "bob marley" and we
  # had {"bob" -> [d1, d4, d6], "marley" -> [d2, d4, d5]}, the result
  # would be a counter of {d1:1, d2:1, d4:2, d5:1, d6:1}.
  document_overlaps = counter.Counter()
  for word in query.words_counter:
    document_overlaps.update(inverted_index[word])

  return [(query, d, score) for (d, score) in document_overlaps.most_common()]
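The worked example in the comment can be reproduced with the standard library's collections.Counter, which behaves the same way here (the document names d1..d6 are hypothetical):

import collections

inverted_index = {"bob": ["d1", "d4", "d6"], "marley": ["d2", "d4", "d5"]}

document_overlaps = collections.Counter()
for word in ["bob", "marley"]:
    document_overlaps.update(inverted_index[word])

print(document_overlaps)  # d4 appears twice, every other document once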
Example #15
def basicFeatureExtractorDigit(datum):
    """
    Returns a set of pixel features indicating whether
    each pixel in the provided datum is white (0) or gray/black (1)
    """
    a = datum.getPixels()

    features = counter.Counter()
    for x in range(DIGIT_DATUM_WIDTH):
        for y in range(DIGIT_DATUM_HEIGHT):
            if datum.getPixel(x, y) > 0:
                features[(x,y)] = 1
            else:
                features[(x,y)] = 0
    return features
Example #16
def basicFeatureExtractorFace(datum):
    """
    Returns a set of pixel features indicating whether
    each pixel in the provided datum is an edge (1) or no edge (0)
    """
    a = datum.getPixels()

    features = counter.Counter()
    for x in range(FACE_DATUM_WIDTH):
        for y in range(FACE_DATUM_HEIGHT):
            if datum.getPixel(x, y) > 0:
                features[(x,y)] = 1
            else:
                features[(x,y)] = 0
    return features
Example #17
def main(confc):
    logger = logging.getLogger("main")

    #conf = Config_mascrawlloggen()

    # create counters
    counters = counter.Counter()

    # create output file
    fcsv = open(confc["outputfile"],'w')
    csvwriter = csv.writer(fcsv,quoting=csv.QUOTE_ALL)
    header = ("id","fullpath","url","parenturl","datetime","hop","content_type")
    csvwriter.writerow(header)

    # work through the download directory for all .txt files
    pdfi = 0
    logger.info("traversing data directory")
    for root,dirs,files in os.walk(confc["datadir"]):
        for f in files:
            if f.endswith('.txt'):
                url_fetched = get_url_fetched(os.path.join(root,f))
                if not url_fetched: continue
                pdfpath = os.path.join(root,os.path.splitext(f)[0]+'.pdf')
                print_prog(pdfi,len(files),step=1000,comments=pdfpath)

                if not os.path.exists(pdfpath):
                    logger.warning("pdf file not found: "+pdfpath)
                    continue
                counters.addCounter('all')
                pdfi += 1

                # first try to get date time from file name
                # if not work, try to get it from the file itself
                datetimestr = f.split("_")[0]
                try:
                    dt = datetime.strptime(datetimestr,"%Y-%m-%d-%H-%M-%S")
                except ValueError:
                    (f_mode, f_ino, f_dev, f_nlink, f_uid, f_gid, f_size, f_atime, f_mtime, f_ctime) = os.stat(pdfpath)
                    f_mtime = time.ctime(f_mtime)
                    dt = datetime.strptime(f_mtime,"%a %b %d %H:%M:%S %Y")
                parenturl = ""
                hop = 0
                content_type = 'application/pdf'
                outline = (pdfi,pdfpath,url_fetched,parenturl,str(dt),hop,content_type)
                csvwriter.writerow(outline)
            
    fcsv.close()
    logger.info("file output to: "+fcsv.name)
Example #18
def main():
    # The following causes the constructor to be invoked:  __init__()
    score_counter = counter.Counter()  # Create a counter object
    print(score_counter.get_count())  # Display the current value of counter
    for points in range(10):  # Increment the counter 10 times
        score_counter.increment()
    print(score_counter.get_count())  # Display the current value of counter
    for points in range(5):  # Decrement the counter 5 times
        score_counter.decrement()
    print(score_counter.get_count())  # Display the current value of counter
    score_counter.reset()  # Reset the counter
    print(score_counter.get_count())  # Display the current value of counter
    # The following causes the 'to string' method to be invoked:  __str__()
    print(score_counter)  # Displays str representation of counter
    score_counter.set_count(100)  # Set counter to 100
    print(score_counter)  # Displays str representation of counter
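The counter.Counter used in this example is a simple count-holding object. A minimal sketch that supports every call made above (get_count, increment, decrement, reset, set_count, and str()), written as an assumption rather than the original class:

class Counter:
    def __init__(self, count=0):
        self._count = count

    def increment(self):
        self._count += 1

    def decrement(self):
        self._count -= 1

    def reset(self):
        self._count = 0

    def get_count(self):
        return self._count

    def set_count(self, count):
        self._count = count

    def __str__(self):
        return "count=%d" % self._count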
Example #19
def main():
    """Creates a graph.dot file with interesting information."""

    print("Parsing wikipedia.txt")
    wiki_words = get_wikipedia_words("data/wikipedia.txt")

    # Parse the aliases.txt file.
    print("Parsing aliases.txt")
    (aliases, inverse_aliases) = get_aliases("data/aliases.txt")

    # Parse the roles.txt file.
    print("Parsing roles.txt")
    employees_info = info.get_employees_map("data/roles.txt", inverse_aliases)

    # Parse the graph.txt file to get the email graph.
    print("Parsing graph.txt")
    (_, email_graph) = graphs.process_file("data/graph.txt", inverse_aliases)
    interesting_nodes = get_interesting_nodes(email_graph, aliases)

    relations = get_relations(interesting_nodes, employees_info, wiki_words)

    # Write the resultant graph.
    print("Writing results.")
    with open("graph.dot", "w") as f:
        f.write('digraph G {\n')
        print("%s relations" % len(relations))
        for (i, relation) in enumerate(sorted(relations)):
            print("%s" % i)
            a = relation.from_info
            b = relation.to_info

            # Choose the most common word in the email subjects.
            c = counter.Counter()
            for email in relation.emails.emails:
                if email.subject is not None:
                    c.update(email.subject)
            if len(c) > 0:
                best_word = c.most_common(1)[0][0]
            else:
                best_word = ""

            f.write('"%s" -> "%s" [label = "%s"];\n' %
                    (a.description(), b.description(), best_word))
        f.write('}\n')
Example #20
def main():
    # Argument parsing
    args = parseArguments()

    # Counter class, holds all 4 counters
    counter = c.Counter()

    # Parse inputted weights for generator
    weights = [args.__dict__[chr(97 + x)] / 26 for x in range(0, 26)]

    # Initializing and generating text
    gen = gt.GenText()
    dataset = gen.generate(args.letters, weights)
    # All 4 counters plus the exact counter
    ec = counter.ExactCount(dataset)
    mg = counter.MisraGries(dataset, args.kappa)
    mm = counter.MankuMotwani(dataset, args.kappa)
    m = counter.Metwally(dataset, args.kappa)
    cms = counter.Count_Min_Sketch(dataset, args.error, args.factor)
    smallest = 3

    print("Exact Count")
    print((sorted(ec[0][:smallest], reverse=True), ec[1], ec[2]))

    print("MisraGries Score")
    print(scoreCalc(ec, mg, smallest))
    print("Accuracy: " + str(accuracy(ec[0], mg[0], smallest)) + "%")
    print((mg[0][:smallest], mg[1], mg[2]))

    print("MankuMotwani Score")
    print(scoreCalc(ec, mm, smallest))
    print("Accuracy: " + str(accuracy(ec[0], mm[0], smallest)) + "%")
    print((mm[0][:smallest], mm[1], mm[2]))

    print("Metwally Score")
    print(scoreCalc(ec, m, smallest))
    print("Accuracy: " + str(accuracy(ec[0], m[0], smallest)) + "%")
    print((m[0][:smallest], m[1], m[2]))

    print("Count_Min_Sketch Score")
    print(scoreCalc(ec, cms, smallest))
    print("Accuracy: " + str(accuracy(ec[0], cms[0], smallest)) + "%")
    print((cms[0][:smallest], cms[1], cms[2]))
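For context, the MisraGries counter compared above is presumably the classic Misra-Gries heavy-hitters summary, which keeps at most k - 1 candidate counters; a minimal standalone sketch of that algorithm (not the project's implementation) looks like this:

def misra_gries(stream, k):
    """Approximate counts of frequent items using at most k - 1 counters."""
    counters = {}
    for item in stream:
        if item in counters:
            counters[item] += 1
        elif len(counters) < k - 1:
            counters[item] = 1
        else:
            # Decrement every counter and drop the ones that reach zero.
            for key in list(counters):
                counters[key] -= 1
                if counters[key] == 0:
                    del counters[key]
    return counters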
Example #21
 def run(self):
     """
     This function performs the action chosen by the user from a text menu.
     """
     account = counter.Counter()
     account.set_operations(self.get_data_from_file())
     i = 0
     while i != 9:
         i = self.user_input(lambda: input("1 - Show operations history \n"
                                           "2 - Add new operation \n"
                                           "3 - Get operations by money \n"
                                           "4 - Get operation by description \n"
                                           "5 - Get operation by date \n"
                                           "6 - Get balance \n"
                                           "7 - Clear operations history \n"
                                           "8 - Change file \n"
                                           "9 - Exit \n"))
         if i == 1:
             for op in account:
                 print(op)
         elif i == 2:
             account.add_operation(self.get_new_operation())
             self.save_data_into_file(account.get_operations())
         elif i == 3:
             for k in account.get_operations_by_money(
                     int(self.user_input(
                         lambda: input("Enter money value = ")))):
                 print(k)
         elif i == 4:
             for k in account.get_operations_by_description(
                     input("Enter description value: ")):
                 print(k)
         elif i == 5:
             for k in account.get_operations_by_date(self.get_datetime()):
                 print(k)
         elif i == 6:
             print("balance = " + str(account.get_balance()))
         elif i == 7:
             account.delete_operations()
         elif i == 8:
             account.set_operations(self.get_data_from_file())
         elif i != 9:
             print("Wrong choice, try again")
Example #22
def main():

    pc = ProgressChecker()
    c = counter.Counter()
    totalsize = 0
    bytype = dict()
    states = dict()
    processed_states = dict()

    for d in argv[1:]:
        todo = 0
        for ftmp in os.listdir(d):
            if ".zip" in ftmp:
                f = d + "/" + ftmp
                if pc.isFinished(f):
                    processed_states[os.path.basename(d)] = 1
                else:
                    nr, nrbytype = c.count(f)
                    todo += nr
                    k = os.path.basename(d)
                    if k in states:
                        states[k] += nr
                    else:
                        states[k] = nr
                    for k in nrbytype.keys():
                        if k in bytype:
                            bytype[k] += nrbytype[k]
                        else:
                            bytype[k] = nrbytype[k]
        if todo > 0:
            totalsize = totalsize + todo
            print(str(todo) + ": " + d)
    print("Have about " + str(totalsize) + " records to add.")
    a = sorted(bytype.keys())
    for k in a:
        print("Records of type " + k + ":  " + str(bytype[k]))
    print("Have not added any records for the following states:")
    a = sorted(states.keys())
    for k in a:
        if k not in processed_states:
            print("\t" + k + "\t" + str(states[k]))
Example #23
def startup(verbal=False):
    # initial setups
    # "crawldir" contains all crawled documents including ".pdf" and ".txt" files
    crawldir = '/msa/crawl/directory/crawler-out'
    # "outputlogpath" is the output file path
    outputlogpath = "/msa/crawl/directory/crawl.log"

    # open output file
    fout = open(outputlogpath, 'w')

    # create counters
    counters = counter.Counter()
    counters.newCounter('txt')
    counters.newCounter('pdf')

    # walk through the crawl directory for all .txt files
    for root, dirs, files in os.walk(crawldir):
        for f in files:
            if f.endswith('.txt'):
                counters.addCounter('txt')
                # load the txt file
                lines = open(os.path.join(root, f)).readlines()
                for line in lines:
                    line = line.strip('\n')
                    if line.startswith("Fetched"):
                        url_fetched = line[9:]
                        break

                # write filename and URL into output file
                pdfpath = os.path.splitext(f)[0] + '.pdf'
                lineout = ' '.join([pdfpath, url_fetched]) + '\n'
                fout.write(lineout)
                if verbal: print(lineout)
            elif f.endswith('.pdf'):
                counters.addCounter('pdf')
            else:
                print('unknown file extension: ', f)

    fout.close()
    counters.setCounter('all', counters.txt + counters.pdf)
    counters.printCounter()
    print('file output to: ' + outputlogpath)
Example #24
def get_all_possibilities(word):
    print(word)
    letters = list(word)
    letters_in_TCHSET = list(map(lambda x: "ENG_" + x, letters))
    possibilities_in_TCHSET = list(map(lambda x: eng_to_readablechars[x] if eng_to_readablechars[x] else x, letters_in_TCHSET))
    possibilities_per_slot = list(map(lambda x: len(x) if x else 1, possibilities_in_TCHSET))
    if possibilities_per_slot == 0:
        return [word]
    possibility_counter = counter.Counter(len(word), possibilities_per_slot)
    generate_word = get_word_generator(possibilities_in_TCHSET)
    all_possibilities = []
    selection_number = possibility_counter.get_curr_value()
    all_possibilities.append(generate_word(selection_number))

    while possibility_counter.can_increment():
        possibility_counter.increment()
        selection_number = possibility_counter.get_curr_value()
        all_possibilities.append(generate_word(selection_number))

    all_possibilities = list(map(lambda x: "".join(x), all_possibilities))
    return all_possibilities
Example #25
    def calculateLogJointProbabilities(self, datum):
        """
        Returns the log-joint distribution over legal labels and the datum.
        Each log-probability should be stored in the log-joint counter, e.g.        
        logJoint[3] = <Estimate of log( P(Label = 3, datum) )>
        
        To get the list of all possible features or labels, use self.features and 
        self.legalLabels.
        """
        logJoint = counter.Counter()
        evidence = datum.items()
        "*** YOUR CODE HERE ***"
        for y in self.legalLabels:
            if self.Pr_distribution_of_features[y] != 0:
                logJoint[y] = math.log(self.Pr_distribution_of_features[y])
            else:
                logJoint[y] = 0
            for f in self.conditionals:
                prob = self.conditionals[f][datum[f]][y]
                logJoint[y] += (prob and math.log(prob) or 0.0)

        return logJoint
Example #26
  def calculate_similarity(self, query_file, data_file, filename):
    """Calculate the similarity between a query file and a data file.

    The results are written to a file named "filename"."""

    queries_set = doc.DocumentSet(query_file)
    documents_set = doc.DocumentSet(data_file)

    results = []
    for query in queries_set.documents:
      # Compute the initial tfidfs.
      initial_tfidfs = self.tf_idf._tfidf(query, documents_set)

      # Select the top n_d scoring documents.
      initial_tfidfs = sorted([(-s, d) for (_, d, s) in initial_tfidfs])
      initial_tfidfs = [(d, -s) for (s, d) in initial_tfidfs[:self.n_d]]
      selected_docs = [document for (document, _) in initial_tfidfs]

      # Combine the top documents into a 'mega document'.
      summed_counter = counter.Counter(query.words_counter)
      for document in selected_docs:
        summed_counter += document.words_counter
      mega_document = doc.document_from_dict(None, dict(summed_counter))

      # Select the top n_w scoring words (via tf.idf) from the megadocument.
      word_scores = []
      for word in sorted(list(mega_document.words_counter)):
        score = self.tf_idf._document_tfidf(word, mega_document, documents_set)
        word_scores.append((-score, word))
      word_scores = sorted(word_scores)[:self.n_w]
      word_scores = [(word, -score) for (score, word) in word_scores]

      # Use these new words as the next query, and return the tf.idf scores.
      new_query = doc.document_from_dict(query.id, dict(word_scores))
      results.extend(self.tf_idf._tfidf(new_query, documents_set))

    output.write_output_file(filename, results)
Example #27
import counter
# import joker
import test_joker

addons = [
    counter.Counter(),
    # joker.Joker(),
    test_joker.Joker()
]
Example #28
    def train(self, trainingData, trainingLabels, validationData,
              validationLabels):
        """
        Trains the classifier by collecting counts over the training data, and
        stores the Laplace smoothed estimates so that they can be used to classify.
        Evaluate each candidate value in spgrid to choose the smoothing
        parameter that gives the best accuracy on the held-out validationData.
        
        trainingData and validationData are lists of feature Counters.    The corresponding
        label lists contain the correct label for each datum.
        
        To get the list of all possible features or labels, use self.features and 
        self.legalLabels.
        """

        # might be useful in your code later...
        # this is a list of all features in the training set.

        pre_fs = []
        for datum in trainingData:
            for key in datum.keys():
                pre_fs.append(key)

        feature_set = set(pre_fs)
        self.features = list(feature_set)

        #sys.exit(1)

        spgrid = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 20, 50]

        Pr_distribution_of_features = counter.Counter()

        for label in trainingLabels:
            Pr_distribution_of_features[label] += 1

        Pr_distribution_of_features.normalize()
        self.Pr_distribution_of_features = Pr_distribution_of_features

        # Initialize stuff
        binary_feature_counts = {}
        totals = {}
        for f in self.features:
            binary_feature_counts[f] = {
                0: counter.Counter(),
                1: counter.Counter()
            }
            totals[f] = counter.Counter()

        # Calculate totals and binary feature counts

        # enumerate(thing) returns an iterator that will yield
        # (0, thing[0]), (1, thing[1]), (2, thing[2]), ...
        for i, datum in enumerate(trainingData):
            y = trainingLabels[i]
            for f, value in datum.items():
                binary_feature_counts[f][value][y] += 1.0
                totals[f][y] += 1.0

        bestConditionals = {}
        bestAccuracy = None
        # Evaluate each smoothing value, and use the one that yields the best accuracy
        for sp in spgrid or [0.0]:
            correct = 0
            conditionals = {}
            for f in self.features:
                conditionals[f] = {0: counter.Counter(), 1: counter.Counter()}

            # Run Laplace smoothing
            for f in self.features:
                for value in [0, 1]:
                    for y in self.legalLabels:
                        conditionals[f][value][y] = (
                            binary_feature_counts[f][value][y] +
                            sp) / (totals[f][y] + sp * 2)

            # Check the accuracy associated with this smoothing value
            self.conditionals = conditionals
            guesses = self.classify(validationData)
            for i, guess in enumerate(guesses):
                correct += (validationLabels[i] == guess and 1.0 or 0.0)
            accuracy = correct / len(guesses)

            # Keep the best smoothing value so far
            if bestAccuracy is None or accuracy > bestAccuracy:
                bestAccuracy = accuracy
                bestConditionals = conditionals
                self.sp = sp

        self.conditionals = bestConditionals
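As a quick arithmetic check of the Laplace-smoothed estimate computed in train() above (the numbers below are made-up, not from the dataset):

# Illustrative numbers only: the smoothed estimate as computed in train().
# With sp = 1, a binary feature observed 3 times out of 10 for a label gets
# (3 + 1) / (10 + 1 * 2) = 4 / 12.
sp = 1.0
count, total = 3.0, 10.0
print((count + sp) / (total + sp * 2))  # 0.3333...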
Example #29
import os
import socketserver

import counter
import parse_datagram
import pretty_json


COUNTER = counter.Counter()
HOST = "0.0.0.0"
METRICS_PORT = int(os.environ.get("METRICS_PORT", 8125))


class Handler(socketserver.BaseRequestHandler):

    PREFIX_SUFFIX = METRICS_PORT

    def handle(self):
        count = COUNTER.increment()
        # There is no difference between request count / metric count as there
        # is for the trace server.
        prefix = f"{count:03d}-{count:03d}-{self.PREFIX_SUFFIX} | "

        data = self.request[0].strip()
        data_parsed = parse_datagram.parse_metric(data)
        try:
            colorful_json = pretty_json.printable_json(data_parsed, prefix)

            print(f"{prefix}metric =")
            print(colorful_json, end="")
        except Exception:
            # Minimal fallback: skip datagrams that cannot be pretty-printed.
            pass
Example #30
import counter
import coverage

counter.Counter.connect()
instance = counter.Counter()


def testGet():
    instance.setCount(65535)
    assert instance.getCount() == 65535


def testIncrement():
    instance.setCount(655)
    instance.increment()
    assert instance.getCount() == 656


def testSet():
    instance.setCount(3333333)
    assert instance.getCount() == 3333333


def testNegativeDecrement():
    instance.setCount(0)
    assert instance.decrement() == False


def testComplex():
    instance.setCount(65535)
    instance.increment()