Ejemplo n.º 1
0
def tag_json():
    """
    Tag every record returned by ``db_util.get_all_records()``.

    For each record, the keys that ``tag_analyzer.hash_string`` produces for
    every phrase length from 1 to ``tag_analyzer.MAX_PHRASE_LENGTH`` are
    pooled into one set. The members of ``canonical_set`` found in that pool
    are mapped through ``canonical_tags`` and then ``tag_index``; the
    resulting list is assigned to ``record.tags`` and the record is saved.
    Progress is reported through a ``ProgressBar``.
    """
    all_records = db_util.get_all_records()
    progress = ProgressBar(widgets=[SimpleProgress()], maxval=len(all_records)).start()
    for done, entry in enumerate(all_records, 1):
        nasa_record = db_util.open_NASARecord(entry)

        # Pooling the phrases of every length into one set is fine because
        # matching is bag-of-words: only membership matters.
        phrases = set()
        for phrase_len in range(1, tag_analyzer.MAX_PHRASE_LENGTH + 1):
            phrase_table = {}
            # hash_string fills phrase_table in place; only its keys are used.
            tag_analyzer.hash_string(nasa_record, phrase_len, phrase_table)
            phrases = phrases | set(phrase_table.keys())

        # Canonical names present in the record, translated to tag indices.
        matched = list(canonical_set.intersection(phrases))
        nasa_record.tags = [tag_index[canonical_tags[name]] for name in matched]
        nasa_record.save()
        # TODO: save back into the file
        # TODO: create a tag_list.txt file which has all the possible tags
        # in a list, so that you don't have to write the whole string into
        # the file
        progress.update(done)
    progress.finish()
Ejemplo n.º 2
0
def tag_json():
  """
  Tag every record returned by ``db_util.get_all_records()``.

  For each record, the keys that ``tag_analyzer.hash_string`` produces for
  every phrase length from 1 to ``tag_analyzer.MAX_PHRASE_LENGTH`` are
  pooled into one set. The members of ``canonical_set`` found in that pool
  are mapped through ``canonical_tags`` and then ``tag_index``; the
  resulting list is assigned to ``record.tags`` and the record is saved.
  Progress is reported through a ``ProgressBar``.
  """
  all_records = db_util.get_all_records()
  progress = ProgressBar(widgets=[SimpleProgress()], maxval=len(all_records)).start()
  for done, entry in enumerate(all_records, 1):
    nasa_record = db_util.open_NASARecord(entry)

    # Pooling the phrases of every length into one set is fine because
    # matching is bag-of-words: only membership matters.
    phrases = set()
    for phrase_len in range(1, tag_analyzer.MAX_PHRASE_LENGTH + 1):
      phrase_table = {}
      # hash_string fills phrase_table in place; only its keys are used.
      tag_analyzer.hash_string(nasa_record, phrase_len, phrase_table)
      phrases = phrases | set(phrase_table.keys())

    # Canonical names present in the record, translated to tag indices.
    matched = list(canonical_set.intersection(phrases))
    nasa_record.tags = [tag_index[canonical_tags[name]] for name in matched]
    nasa_record.save()
    # TODO: save back into the file
    # TODO: create a tag_list.txt file which has all the possible tags
    # in a list, so that you don't have to write the whole string into
    # the file
    progress.update(done)
  progress.finish()
Ejemplo n.º 3
0
def find_word_occurences(word_len, out_file):
    """
    Accumulate ``hash_string`` results over every input file and output them.

    Each entry of ``get_input_file_list()`` is opened with
    ``db_util.open_NASARecord`` and passed to ``hash_string`` together with
    ``word_len`` and one shared dictionary. Once all entries are processed,
    the dictionary is handed to ``output_file`` along with ``out_file``.

    NOTE(review): the progress bar maximum is ``NUM_ENTRIES``, not the length
    of the input list -- presumably the two agree; confirm against callers.
    """
    sources = get_input_file_list()
    occurrences = {}
    progress = ProgressBar(widgets=[SimpleProgress()], maxval=NUM_ENTRIES).start()
    for done, source in enumerate(sources, 1):
        # hash_string updates the shared dictionary in place.
        hash_string(db_util.open_NASARecord(source), word_len, occurrences)
        progress.update(done)
    progress.finish()
    output_file(occurrences, out_file)
Ejemplo n.º 4
0
def find_word_occurences(word_len, out_file):
  """
  Accumulate ``hash_string`` results over every input file and output them.

  Each entry of ``get_input_file_list()`` is opened with
  ``db_util.open_NASARecord`` and passed to ``hash_string`` together with
  ``word_len`` and one shared dictionary. Once all entries are processed,
  the dictionary is handed to ``output_file`` along with ``out_file``.

  NOTE(review): the progress bar maximum is ``NUM_ENTRIES``, not the length
  of the input list -- presumably the two agree; confirm against callers.
  """
  sources = get_input_file_list()
  occurrences = {}
  progress = ProgressBar(widgets=[SimpleProgress()], maxval=NUM_ENTRIES).start()
  for done, source in enumerate(sources, 1):
    # hash_string updates the shared dictionary in place.
    hash_string(db_util.open_NASARecord(source), word_len, occurrences)
    progress.update(done)
  progress.finish()
  output_file(occurrences, out_file)
Ejemplo n.º 5
0
def generate_html():
    """
    Write ``tags_preview.html``, an HTML preview of every record and its tags.

    Refreshes the module-level ``tag_list`` from ``tag_json.get_tag_list()``
    and then writes one ``<p>`` section per record returned by
    ``db_util.get_all_records()``. Each section holds a link to
    ``record.med_image`` labelled with ``record.record_id``, the category and
    description after ``get_highlighted_text``, and the record's tags (the
    text ``None`` when the record has no tags).

    The output file is managed by a ``with`` block so it is flushed and
    closed even if processing a record raises part-way through.
    """
    global tag_list
    tag_list = tag_json.get_tag_list()
    records = db_util.get_all_records()
    with open("tags_preview.html", "w") as htmlfile:
        htmlfile.write("<html>\n<head>\n")
        htmlfile.write('<link rel="stylesheet" type="text/css" href="preview.css" />')
        htmlfile.write("</head>\n<body>\n")
        pbar = ProgressBar(widgets=[SimpleProgress()], maxval=len(records)).start()
        for idx, record in enumerate(records):
            record = db_util.open_NASARecord(record)
            htmlfile.write("<p>\n")
            record_w_link = '<a href="%s">%s</a>' % (record.med_image, record.record_id)

            category = get_highlighted_text(record.category, record.tags)
            description = get_highlighted_text(record.description, record.tags)

            htmlfile.write(format_div(record_w_link, "record_id"))
            htmlfile.write(format_div(category, "category"))
            htmlfile.write("<br/>")
            htmlfile.write(format_div(description, "description"))
            htmlfile.write("<br/>")
            htmlfile.write(format_div("Tags:", "tag_divider"))
            # TODO: make the tags the same colors

            if record.tags:
                # tag_num is the tag's position within this record;
                # tag_id indexes into the module-level tag_list.
                tag_text = ", ".join(
                    format_tag(tag_list[tag_id][0], tag_num)
                    for tag_num, tag_id in enumerate(record.tags)
                )
            else:
                tag_text = "None"
            htmlfile.write(format_div(tag_text, "tag_text"))
            htmlfile.write("<br/>")

            htmlfile.write("</p>\n")
            pbar.update(idx + 1)
        pbar.finish()
        htmlfile.write("</body></html>\n")