Example #1
0
def map_representation(structured_nps,
                       start_words=None,
                       ranking_algorithm=1,
                       similarity_algorithm=2,
                       filtering_algorithm=1,
                       number_of_terms=1000,
                       simplify_terms=False,
                       model=None,
                       data_dump_path=None):
    """returns a pair similarity dictionary for the map and set of terms in the map. Heatmap can
    be calculated seperately and then overlaid. Will need to convert dictionary representation
    to dot file format"""
    flattened = flatten(structured_nps)
    set_status('ranking terms', model=model)
    if start_words is not None:
        # start words should be a list like ["machine learning", "artificial intelligence"]
        start_words = [tuple(s.split()) for s in start_words]
        ranked_phrases, phrase_frequencies, scored_phrases = call_rank(
            ranking_algorithm,
            flattened,
            number_of_terms,
            start_words=start_words,
            model=model)
    else:
        ranked_phrases, phrase_frequencies, scored_phrases = call_rank(
            ranking_algorithm, flattened, number_of_terms, model=model)
    if simplify_terms:
        structured_nps = simplification.term_replacement(
            structured_nps, ranked_phrases)
    set_status('calculating similarity', model=model)
    sim_matrix, phrase_lookups = call_similarity(
        similarity_algorithm,
        structured_nps,
        ranked_phrases,
        model=model,
        status_callback=lambda s: set_status(s, model=model))
    if data_dump_path:
        import pickle
        from os.path import join

        def prefix_path(rel):
            return join(data_dump_path, rel)

        with open(prefix_path('sim_matrix.pickle'), 'wb') as f:
            pickle.dump(sim_matrix, f)
        with open(prefix_path('phrase_lookups.pickle'), 'wb') as f:
            pickle.dump(phrase_lookups, f)
        with open(prefix_path('phrase_frequencies.pickle'), 'wb') as f:
            pickle.dump(phrase_frequencies, f)
    phrase_pairs = call_filter(filtering_algorithm,
                               sim_matrix,
                               phrase_lookups,
                               model=model)
    normed = similarity.similarity_dict_to_distance(phrase_pairs)
    # build set of terms in graph
    graph_terms = set()
    for term, lst in normed.items():
        graph_terms.add(term)
        graph_terms.update(term for term, val in lst)
    return normed, graph_terms, phrase_frequencies, phrase_pairs, scored_phrases
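A minimal usage sketch for map_representation, mirroring how Example #22 (make_basemap) later in this listing calls it; extracted_terms and basemap are assumed to come from extract_terms() and the basemap model elsewhere in the pipeline, and the keyword values simply illustrate the index codes documented in call_rank, call_similarity, and call_filter below.

# Hypothetical call; extracted_terms and basemap are assumed inputs.
normed, graph_terms, phrase_frequencies, phrase_pairs, scored_phrases = map_representation(
    extracted_terms,
    start_words=["machine learning", "artificial intelligence"],
    ranking_algorithm=1,       # 1 = C-value (see call_rank)
    similarity_algorithm=2,    # 2 = Jaccard, partial match (see call_similarity)
    filtering_algorithm=1,     # 1 = Pull in Lesser Terms (see call_filter)
    number_of_terms=1000,
    model=basemap)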
Example #2
0
def handle_status(config, args):
    if args['clear']:
        clear_status(config.slack, config.default_statuses, config.default_dnd)
    elif args['set']:
        try:
            status = config.statuses[args['<status>']]
        except KeyError:
            print(
                f'{args["<status>"]} is not a vaild status. Valid statuses are:'
            )
            print_statuses_list(config.statuses)
            exit(1)

        set_status(config.slack, status, args['<time>'])
    elif args['show']:
        if args['<status>'] is None:
            print(get_status(config.slack))
        else:
            try:
                print(config.statuses[args['<status>']])
            except KeyError:
                print(
                    f'{args["<status>"]} is not a vaild status. Valid statuses are:'
                )
                print_statuses_list(config.statuses)
                exit(1)

    elif args['list']:
        print_statuses_list(config.statuses)
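handle_status expects a docopt-style argument dictionary; a sketch of what that dictionary might look like for the set subcommand (the exact key set is an assumption, and config is assumed to be loaded elsewhere):

# Hypothetical docopt-style args for: status set lunch 45m
args = {
    'clear': False,
    'set': True,
    'show': False,
    'list': False,
    '<status>': 'lunch',   # must be a key in config.statuses
    '<time>': '45m',       # passed through to set_status
}
handle_status(config, args)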
Example #3
0
    def acquire_status(self, desired_status_string):
        # Don't need to do anything if we already have the status !!
        if mpc_status.get_status("mpc_temp_status") != desired_status_string:

            try:

                # wait to acquire lock from parallel workers
                with self.lock.acquire(timeout=timeout):

                    # wait to acquire lock from any other code
                    # e.g. PV uses this during identification/linking
                    while mpc_status.get_status(
                            "mpc_temp_status") != desired_status_string:

                        time.sleep(np.random.rand() * 0.01)

                        if mpc_status.get_status("mpc_temp_status") == '':
                            mpc_status.set_status("mpc_temp_status",
                                                  desired_status_string)

                        time.sleep(np.random.rand() * 0.01)

                    assert mpc_status.get_status("mpc_temp_status") == desired_status_string, \
                        f'Problem: mpc_temp_status = {mpc_status.get_status("mpc_temp_status")}'

            except Exception as e:
                print('Problem with *acquire_status()*')
                print(e)
                print('\t:', desired_status_string)

        return mpc_status.get_status("mpc_temp_status")
Example #4
0
def close_session(handle: LogHandle):
    """
    Closes session and csvfile on disk
    """
    if handle and handle.file:
        print('closing logging session for file:', handle.file.name)
        status.set_status('close logging session with file: {}'.format(
            handle.file.name))
        handle.file.close()
Example #5
0
    def change_port(self, port: serial.Serial):

        if port and self.serial and port.device != self.serial.port:
            # reader thread needs to be shut down
            self._stop_reader()
        self.serial = serial.Serial(port.device, BAUDRATE, timeout=10)
        self.sendCmd('reset')
        print('open port: ', self.serial, self.serial.port)
        status.set_status('open port: {}'.format(self.serial.port))
        self._start_reader()
Example #6
0
def call_filter(filter_index, sim_matrix, phrases, top_limit_override=None, model=None):
    """
    filter_index: 0 = Top; 1 = Pull in Lesser Terms; 2 = Take Top and Fill w/ Lesser
    """
    filtering_fn = filtering_fns[filter_index]
    set_status('filtering and getting pairwise with %s' % filtering_fn, model=model)
    if top_limit_override:
        phrase_pairs = filtering_fn(sim_matrix, phrases, top_limit=top_limit_override)
    else:
        phrase_pairs = filtering_fn(sim_matrix, phrases)
    return phrase_pairs
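call_filter, call_similarity, and call_rank all dispatch through index-ordered registries (filtering_fns, similarity_fns, ranking_fns); the commented-out similarity_fns line in Example #7 below shows the pattern. A minimal sketch of how the filtering registry might be laid out to match the index codes in the docstring above; the module and function names are guesses, not the project's actual identifiers:

# Hypothetical registry; the real function names may differ.
filtering_fns = [
    filtering.top,                   # 0 = Top
    filtering.pull_in_lesser_terms,  # 1 = Pull in Lesser Terms
    filtering.top_and_fill_lesser,   # 2 = Take Top and Fill w/ Lesser
]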
Example #7
0
def call_similarity(similarity_index, structured_nps, phrases, model=None, status_callback=None):
    """
    similarity_index: 0 = LSA (w/ Cosine similarity); 1 = Jaccard; 2 = Jaccard (partial match); 3 = Distributional similarity (w/ Jensen-Shannon divergence)
    """
    # similarity_fns = [similarity.lsa, similarity.jaccard_full, similarity.jaccard_partial]
    similarity_fn = similarity_fns[similarity_index]
    set_status('calculating similarity with %s' % similarity_fn, model=model)
    sim_matrix, phrases = similarity_fn(structured_nps, phrases, status_callback=status_callback)
    # with open('/tmp/sim.pickle', 'w') as f:
    #     pickle.dump(sim_matrix, f)
    return sim_matrix, phrases
Example #8
0
def start_new_session(directory, file_prefix: str, use_csv: bool):
    """
    Parameters
    ----------
    directory: str
        The directory files will be logged to. If it does not exist,
        the logger tries to create it.

    file_prefix: str
        The prefix in the file name.

        e.g. for 'somefile' the filename will be 'somefile_i.csv'
        with i the sequence number of the logging session.

        The logger will check what the latest session with 'file_prefix' in
        'directory' was and choose the next value for 'i'

    use_csv: bool
        If True, data is written through csv.writer to a '.csv' file;
        otherwise a plain '.txt' file is opened and no writer is created.

    Returns
    -------
    logging_handle: LogHandle
        use this handle for writing data during the session

    None if there was a problem
    """

    ext = 'csv' if use_csv else 'txt'
    p = Path(directory)
    if not p.exists():
        try:
            p.mkdir(parents=True)
        except Exception as e:
            print('error creating log dir: \n', e)
            return None

    files = list(p.glob(file_prefix + '*' + ext))
    max_i = -1
    for f in files:
        try:
            i = int(f.stem[len(file_prefix + '_'):])
            max_i = max(i, max_i)
        except ValueError as e:
            print(e)
    try:
        fname = p / '{}_{}.{}'.format(file_prefix, max_i + 1, ext)
        csvfile = open(str(fname), 'w', newline='')
        writer = csv.writer(csvfile, delimiter=',') if use_csv else None
        print('starting new logging session with file:', str(fname))
        status.set_status('starting new logging session with file: {}'.format(
            str(fname)))
        return LogHandle(file=csvfile, writer=writer)
    except Exception as e:
        print('error opening file: \n', e)

    return None
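A short usage sketch combining start_new_session with close_session from Example #4; the directory, prefix, and header row are illustrative values only:

# Hypothetical logging session: open, write one CSV row, close.
handle = start_new_session('/tmp/logs', 'somefile', use_csv=True)
if handle is not None:
    handle.writer.writerow(['timestamp', 'key', 'position'])  # example header row
    close_session(handle)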
Example #9
0
def call_graphviz(map_string, file_format='svg', model=None):
    """map_string should be a string in the dot file format, which the pipeline will be called on. Output in format file_format"""
    set_status('drawing graph', model=model)
    gv_command = graphviz_command(file_format=file_format, **GRAPHVIZ_PARAMS)
    proc = Popen('echo $PATH', stdout=PIPE, shell=True, universal_newlines=True)
    print("path:", proc.communicate()[0])
    proc = Popen(gv_command, stdout=PIPE, stdin=PIPE, stderr=PIPE, shell=True,
                 universal_newlines=True)
    map_out, map_err = proc.communicate(input=map_string)
    print("return code:", proc.returncode)
    if map_err:
        print(map_err)
    return map_out
Example #10
0
def pop_default(config):
    try:
        config.default_statuses.pop()
    except IndexError:
        pass

    config.write_config()

    if len(config.default_statuses) > 0:
        set_status(config.slack, config.default_statuses[-1])
    else:
        clear_status(config.slack)
Example #11
0
def call_graphviz(map_string, file_format='svg', model=None):
    """map_string should be a string in the dot file format, which the pipeline will be called on. Output in format file_format"""
    set_status('drawing graph', model=model)
    gv_command = graphviz_command(file_format=file_format, **GRAPHVIZ_PARAMS)
    proc = Popen('echo $PATH', stdout=PIPE, shell=True, universal_newlines=True)
    print("path:", proc.communicate()[0])
    proc = Popen(gv_command, stdout=PIPE, stdin=PIPE, stderr=PIPE, shell=True,
                 universal_newlines=True)
    map_out, map_err = proc.communicate(input=map_string)
    print("return code:", proc.returncode)
    if map_err:
        print(map_err)
    return map_out
Example #12
0
def call_rank(ranking_index, flattened, n_large, start_words=[], model=None):
    """ranking_index: 0 = TFIDF; 1 = C-value; 2 = C-value + Unigrams; 3 = TF"""
    ranking_fn = ranking_fns[ranking_index]
    ranking_fn_name = ranking_fn_names[ranking_index]
    set_status('ranking with %s' % ranking_fn_name, model=model)
    if debug:
        print('ranking with %s' % ranking_fn_name)
    scored_phrases, phrase_frequencies = ranking_fn(flattened)
    set_status('ordering', model=model)
    if debug:
        print('ordering')
    ordered_phrases = sorted(scored_phrases.items(),
                             key=lambda p: p[1],
                             reverse=True)
    #    ordered_fname ='../phrase_lists/%s.phrases' % ranking_index
    #    print 'writing ordered phrases to file %s' % ordered_fname
    #    with open(ordered_fname, 'w') as f:
    #        for o in ordered_phrases[:n_large]:
    #            f.write('%s\n' % str(o))
    if debug:
        print('mapping')
    ranked_phrases = [p[0] for p in ordered_phrases]

    if debug:
        print('trimming large')
    large_phrases = ranked_phrases[:n_large]

    if start_words:
        if debug:
            print('looking for start words', start_words)
        found_start_words = []
        for start_word in start_words:
            matches = (ranked_phrase for ranked_phrase in ranked_phrases
                       if start_word in sub_lists(ranked_phrase, proper=False))
            try:
                word = next(matches)
                if word not in large_phrases:
                    found_start_words.append(word)
            except StopIteration:
                if debug:
                    print('start word %s not found' % start_word)
            if debug:
                print('found start words', found_start_words)

        top_phrases = found_start_words + large_phrases
    else:
        top_phrases = large_phrases

    filtered_frequencies = dict(
        (phrase, freq) for (phrase, freq) in phrase_frequencies.items()
        if phrase in top_phrases)
    return top_phrases, filtered_frequencies, scored_phrases
Example #13
0
    def relinquish_status(self, desired_status_string):

        # Only attempt to change if the temp_status is what you think it is ...
        if mpc_status.get_status("mpc_temp_status") == desired_status_string:

            # wait to acquire lock from parallel workers
            with self.lock.acquire(timeout=timeout):

                # set empty status
                mpc_status.set_status("mpc_temp_status", "")
                return True
        else:
            return False
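acquire_status (Example #3) and relinquish_status are evidently meant to bracket a critical section guarded by the shared "mpc_temp_status" flag. A minimal sketch of the intended pairing, assuming both methods live on the same class and worker is an instance of it; do_flat_file_work() is a placeholder, and the status string is the one used in Example #26:

# Hypothetical critical section guarded by mpc_temp_status.
worker.acquire_status('MJP_FIXING_PRIMARY_FLAT_FILES')
try:
    do_flat_file_work()   # placeholder for the protected operation
finally:
    worker.relinquish_status('MJP_FIXING_PRIMARY_FLAT_FILES')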
Example #14
0
def call_rank(ranking_index, flattened, n_large, start_words=[], model=None):
    """ranking_index: 0 = TFIDF; 1 = C-value; 2 = C-value + Unigrams; 3 = TF"""
    ranking_fn = ranking_fns[ranking_index]
    ranking_fn_name = ranking_fn_names[ranking_index]
    set_status('ranking with %s' % ranking_fn_name, model=model)
    if debug:
        print('ranking with %s' % ranking_fn_name)
    scored_phrases, phrase_frequencies = ranking_fn(flattened)
    set_status('ordering', model=model)
    if debug:
        print('ordering')
    ordered_phrases = sorted(scored_phrases.items(),
                             key=lambda p: p[1], reverse=True)
#    ordered_fname ='../phrase_lists/%s.phrases' % ranking_index
#    print 'writing ordered phrases to file %s' % ordered_fname
#    with open(ordered_fname, 'w') as f:
#        for o in ordered_phrases[:n_large]:
#            f.write('%s\n' % str(o))
    if debug:
        print('mapping')
    ranked_phrases = [p[0] for p in ordered_phrases]

    if debug:
        print('trimming large')
    large_phrases = ranked_phrases[:n_large]

    if start_words:
        if debug:
            print('looking for start words', start_words)
        found_start_words = []
        for start_word in start_words:
            matches = (ranked_phrase for ranked_phrase in ranked_phrases if start_word in sub_lists(ranked_phrase, proper=False))
            try:
                word = next(matches)
                if word not in large_phrases:
                    found_start_words.append(word)
            except StopIteration:
                if debug:
                    print('start word %s not found' % start_word)
            if debug:
                print('found start words', found_start_words)

        top_phrases = found_start_words + large_phrases
    else:
        top_phrases = large_phrases

    filtered_frequencies = dict((phrase, freq) for (phrase, freq) in phrase_frequencies.items() if phrase in top_phrases)
    return top_phrases, filtered_frequencies, scored_phrases
Example #15
0
def call_similarity(similarity_index,
                    structured_nps,
                    phrases,
                    model=None,
                    status_callback=None):
    """
    similarity_index: 0 = LSA (w/ Cosine similarity); 1 = Jaccard; 2 = Jaccard (partial match); 3 = Distributional similarity (w/ Jensen-Shannon divergence)
    """
    # similarity_fns = [similarity.lsa, similarity.jaccard_full, similarity.jaccard_partial]
    similarity_fn = similarity_fns[similarity_index]
    set_status('calculating similarity with %s' % similarity_fn, model=model)
    sim_matrix, phrases = similarity_fn(structured_nps,
                                        phrases,
                                        status_callback=status_callback)
    # with open('/tmp/sim.pickle', 'w') as f:
    #     pickle.dump(sim_matrix, f)
    return sim_matrix, phrases
Example #16
0
def call_filter(filter_index,
                sim_matrix,
                phrases,
                top_limit_override=None,
                model=None):
    """
    filter_index: 0 = Top; 1 = Pull in Lesser Terms; 2 = Take Top and Fill w/ Lesser
    """
    filtering_fn = filtering_fns[filter_index]
    set_status('filtering and getting pairwise with %s' % filtering_fn,
               model=model)
    if top_limit_override:
        phrase_pairs = filtering_fn(sim_matrix,
                                    phrases,
                                    top_limit=top_limit_override)
    else:
        phrase_pairs = filtering_fn(sim_matrix, phrases)
    return phrase_pairs
Example #17
0
def add_default(config, status, time):
    try:
        s = config.statuses[status]
    except KeyError:
        print(f'{status} is not a valid status. Valid statuses are:')
        print_statuses_list(config.statuses)
        exit(1)

    try:
        t = Expiration.from_timestamp(time)
    except TimeFormatError as err:
        print(err)
        exit(1)

    s.status_expiration = t

    config.default_statuses.append(s)
    config.write_config()
    set_status(config.slack, s)
Example #18
0
def filter_query(query, dirty=False, starting_year=None, ending_year=None, sample_size=None, model=None):
    filtered = query
    if not dirty:
        filtered = query.filter(Grant.clean == True)
    if ending_year is not None:
        filtered = filtered.filter(Grant.published_year <= ending_year)
    if starting_year is not None:
        filtered = filtered.filter(Grant.published_year >= starting_year)
    if model is not None:
        documents_in_set = filtered.count()
        model.documents_in_set = documents_in_set
        set_status("%d documents met filtering criteria" % documents_in_set)
    if sample_size is not None:
        filtered = filtered.order_by(func.rand()).limit(sample_size)
    if model is not None:
        documents_sampled = filtered.count()
        model.documents_sampled = documents_sampled
        set_status("%d documents were sampled" % documents_sampled)
    return filtered
Example #19
0
def main():
    s = socket.socket()
    ai = socket.getaddrinfo("0.0.0.0", 80)
    print("Bind address info:", ai)
    addr = ai[0][-1]

    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s.bind(addr)
    s.listen(5)
    print("Listening")

    while True:
        res = s.accept()
        client_sock = res[0]
        client_addr = res[1]
        print("Client address:", client_addr)
        print("Client socket:", client_sock)

        req = client_sock.recv(128)
        cmd = get_command(req)

        print("Request:")
        print(cmd)
        print()

        if (cmd == "/status"):
            client_sock.write(CONTENT % status.status)
        elif (cmd == "/on"):
            status.set_status(1)
            client_sock.write(CONTENT % status.status)
        elif (cmd == "/off"):
            status.set_status(0)
            client_sock.write(CONTENT % status.status)
        elif (cmd == "/c"):
            client_sock.write(CONTENT % status.color_hex)
        elif (cmd[:3] == "/c/"):
            status.set_color(cmd[3:])
            client_sock.write(CONTENT % "OK")
        else:
            client_sock.write(CONTENT % "OK")

        client_sock.close()
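The server above assumes a CONTENT response template and a get_command() helper. A sketch of minimal definitions consistent with how they are used (both are assumptions, not the original module's code):

# Hypothetical definitions, inferred from the usage above.
CONTENT = "HTTP/1.0 200 OK\r\n\r\n%s\r\n"

def get_command(request):
    # request is raw bytes such as b'GET /status HTTP/1.1\r\n...'
    first_line = request.decode().split('\r\n', 1)[0]
    parts = first_line.split()
    return parts[1] if len(parts) > 1 else ''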
Example #20
0
    def __init__(self):
        super(PianoApp, self).__init__()

        self.SerialConnection = SerialConnection()

        self.window = MainWindow()
        self.window.setWindowTitle("Piano Sensor")

        self.window.show()
        signal.signal(signal.SIGINT, self.window.quit)
        self.window.closeSignal.connect(self.quit)

        self.toolbar = QtWidgets.QToolBar()
        self.window.addToolBar(self.toolbar)
        self.mainView = MainView(self.toolbar,
                                 self.SerialConnection.getDropdownWidget())
        self.window.setCentralWidget(self.mainView)

        self.mainView.refresh.connect(self.SerialConnection.refresh)

        self.mainView.resetEncoders.connect(
            lambda: self.SerialConnection.sendCmd('reset'))
        self.mainView.resetSystem.connect(
            lambda: self.SerialConnection.sendCmd('sysreset'))
        self.mainView.getPositions.connect(
            lambda: self.SerialConnection.sendCmd('pos'))

        self.parser = SerialParser()
        self.SerialConnection.textStream.connect(self.parser.parse_line)
        # self.SerialConnection.textStream.connect(self.mainView.textOutputView.addText)

        self.parser.comment.connect(self.mainView.textOutputView.addComment)

        self.parser.newDataSet.connect(
            lambda i, t, p: self.mainView.resultsView.new_results(
                KeyPress(i, t, p)))
        self.parser.newDataSet.connect(
            lambda i, t, p: self.mainView.textOutputView.new_results(
                KeyPress(i, t, p)))

        status.set_status_logger(self.set_status_message)
        status.set_status('Piano Sensor Ready..')
Example #21
0
def filter_query(query, dirty=False, starting_year=None, ending_year=None,
                 sample_size=None, model=None):
    filtered = query
    if not dirty:
        filtered = query.filter(Grant.clean == True)
    if ending_year is not None:
        filtered = filtered.filter(Grant.published_year <= ending_year)
    if starting_year is not None:
        filtered = filtered.filter(Grant.published_year >= starting_year)
    if model is not None:
        documents_in_set = filtered.count()
        model.documents_in_set = documents_in_set
        set_status('%d documents met filtering criteria' % documents_in_set)
    if sample_size is not None:
        filtered = filtered.order_by(func.rand()).limit(sample_size)
    if model is not None:
        documents_sampled = filtered.count()
        model.documents_sampled = documents_sampled
        set_status('%d documents were sampled' % documents_sampled)
    return filtered
Example #22
0
def make_basemap(basemap):
    try:
        set_status('getting document list', model=basemap)
        with ManagedSession() as session:
            filtered_query = create_query_for_model(session, basemap, dirty=False)
            extracted_terms = extract_terms(filtered_query, basemap.term_type)
        if not extracted_terms:
            raise Exception('No documents found matching query!')
        map_dict, graph_terms, phrase_frequencies, unnormed_dict, phrase_scores = map_representation(extracted_terms,
                                                                                                     ranking_algorithm=basemap.ranking_algorithm,
                                                                                                     similarity_algorithm=basemap.similarity_algorithm,
                                                                                                     filtering_algorithm=basemap.filtering_algorithm,
                                                                                                     number_of_terms=basemap.number_of_terms,
                                                                                                     model=basemap)
        # map_string will be a graphviz-processable string
        # map_string = write_dot.output_pairs_dict(map_dict, True, phrase_frequencies=phrase_frequencies, true_scaling=True).decode('ascii', 'ignore')
        map_string = write_dot.output_pairs_dict(map_dict, True, phrase_frequencies=phrase_frequencies, true_scaling=True, similarities=unnormed_dict, phrase_scores=phrase_scores).decode('ascii', 'ignore')
        # save to database
        basemap.dot_rep = map_string
        # basemap.phrase_frequencies = json.dumps(jsonize_phrase_dict(phrase_frequencies), indent=4).decode('ascii', 'ignore')
        # get phrases as a list of lists of strings (one list of words per term)
        basemap.phrases_in_map = json.dumps(jsonize_phrase_set(graph_terms, None)).decode('ascii', 'ignore')
        basemap.save()
        svg_str, width, height = strip_dimensions(call_graphviz(map_string, file_format='svg', model=basemap))
        basemap.svg_rep = svg_str
        basemap.width = width
        basemap.height = height
        basemap.finished = True
        basemap.save()
        set_status('basemap complete', model=basemap)
        print('basemap complete')
        return map_dict, graph_terms
    except ZeroDivisionError as e:
        set_status('Error: too few documents to produce a map. Try a broader search', model=basemap)
Example #23
0
def map_representation(structured_nps, start_words=None, ranking_algorithm=1,
                       similarity_algorithm=2, filtering_algorithm=1,
                       number_of_terms=1000, simplify_terms=False, model=None,
                       data_dump_path=None):
    """returns a pair similarity dictionary for the map and set of terms in the map. Heatmap can
    be calculated seperately and then overlaid. Will need to convert dictionary representation
    to dot file format"""
    flattened = flatten(structured_nps)
    set_status('ranking terms', model=model)
    if start_words is not None:
        # start words should be a list like ["machine learning", "artificial intelligence"]
        start_words = [tuple(s.split()) for s in start_words]
        ranked_phrases, phrase_frequencies, scored_phrases = call_rank(ranking_algorithm, flattened, number_of_terms, start_words=start_words, model=model)
    else:
        ranked_phrases, phrase_frequencies, scored_phrases = call_rank(ranking_algorithm, flattened, number_of_terms, model=model)
    if simplify_terms:
        structured_nps = simplification.term_replacement(structured_nps, ranked_phrases)
    set_status('calculating similarity', model=model)
    sim_matrix, phrase_lookups = call_similarity(similarity_algorithm, structured_nps, ranked_phrases, model=model, status_callback=lambda s: set_status(s, model=model))
    if data_dump_path:
        import pickle
        from os.path import join
        def prefix_path(rel):
            return join(data_dump_path, rel)
        with open(prefix_path('sim_matrix.pickle'), 'wb') as f:
            pickle.dump(sim_matrix, f)
        with open(prefix_path('phrase_lookups.pickle'), 'wb') as f:
            pickle.dump(phrase_lookups, f)
        with open(prefix_path('phrase_frequencies.pickle'), 'wb') as f:
            pickle.dump(phrase_frequencies, f)
    phrase_pairs = call_filter(filtering_algorithm, sim_matrix, phrase_lookups, model=model)
    normed = similarity.similarity_dict_to_distance(phrase_pairs)
    # build set of terms in graph
    graph_terms = set()
    for term, lst in normed.items():
        graph_terms.add(term)
        graph_terms.update(term for term, val in lst)
    return normed, graph_terms, phrase_frequencies, phrase_pairs, scored_phrases
Example #24
0
def make_heatmap(heatmap, graph_terms):
    try:
        set_status('getting document list', model=heatmap)
        with ManagedSession() as session:
            filtered_query = create_query_for_model(session, heatmap, dirty=False)
            extracted_terms = extract_terms(filtered_query, heatmap.term_type)
        heatmap_terms = flatten(extracted_terms)
        heatmap_vals = calculate_heatmap_values(heatmap_terms, graph_terms)
        heatmap.terms = json.dumps(jsonize_phrase_dict(heatmap_vals, 'intensity'))
        set_status('heatmap complete', model=heatmap)
        heatmap.finished = True
        heatmap.save()
        return heatmap_vals
    except Exception as e:
        set_status('Error: %s' % e, model=heatmap)
        raise e
Example #25
0
def make_heatmap(heatmap, graph_terms):
    try:
        set_status('getting document list', model=heatmap)
        with ManagedSession() as session:
            heatmap_query = create_query(session, author=heatmap.author,
                                         institution=heatmap.institution)
            filtered_query = filter_query(heatmap_query, dirty=False,
                                          starting_year=heatmap.starting_year,
                                          ending_year=heatmap.ending_year,
                                          sample_size=heatmap.sample_size,
                                          model=heatmap)
            extracted_terms = extract_terms(filtered_query, heatmap.term_type)
        heatmap_terms = flatten(extracted_terms)
        heatmap_vals = calculate_heatmap_values(heatmap_terms, graph_terms)
        heatmap.terms = json.dumps(jsonize_phrase_dict(heatmap_vals, 'intensity'))
        set_status('heatmap complete', model=heatmap)
        heatmap.finished = True
        heatmap.save()
        return heatmap_vals
    except Exception as e:
        set_status('Error: %s' % e, model=heatmap)
        raise e
Example #26
0
def fix_primary_flat_file_data(desig,
                               incorrect_list,
                               correct_list,
                               DELETING=False):
    '''

    *** Need to be really careful about this ***
    *** Need to do something like ... ***
    *** (i) Find the relevant primary data file [can be in /sa/mpn or in tot*] 
    *** (ii) Freeze the system (lock status)
    *** (iii) Copy primary data file to temp location (esp. while developing) 
    *** (iv) Find the location of the incorrect data in the primary data file 
    *** (v) Replace the incorrect data with the correct data (non-trivial : needs to have pubn-record, etc) 
    *** (vi) Do some sense checks of the difference between the initial and fixed versions 
    *** (vii)  write the data to the temp file 
    *** (viii) Replace the primary data with the fixed copy
    *** (ix) Unlock the system 

    inputs:
    -------

    returns:
    --------

    '''
    # We want to 'permanently' save some output files ...
    save_dir = '/sa/conchecks/data_products/'

    #*** (i) Find the relevant primary data file [can be in /sa/mpn or in tot*]
    src_files = []
    for incorrect_published_obs80 in incorrect_list:
        src_files.extend(
            find_primary_data_file(desig, incorrect_published_obs80))
        src_files = list(set(src_files))
    print('src_files = ', src_files)
    assert len(src_files), \
        f'No src_file could be found that contains the incorrect data ... incorrect_list={incorrect_list}'

    #*** (ii) Freeze the system (lock status)
    # ~~~~~~~~~~~~~ IF WE CRAP OUT AT ANY POINT BELOW WE NEED TO RELEASE THE LOCK ~~~~~~~~~~~~~~~~~~
    print('Setting mpc_temp_status')
    mpc_status.set_status("mpc_temp_status", "MJP_FIXING_PRIMARY_FLAT_FILES")

    try:

        #*** (iii) Copy primary data file to temp location (esp. while developing)
        # I am allowing for the possibility that there are multiple files to be fixed ...
        dst_dir = newsub.generate_subdirectory("obs_cons")
        for src_file in src_files:
            dst_file = os.path.join(dst_dir, os.path.basename(src_file))
            print('dst_file = ', dst_file)
            shutil.copyfile(src_file, dst_file)

            # Read the primary data file
            with open(dst_file, 'r') as fh:
                data = fh.readlines()

            # Files to write to so that MR can update mysql
            bad_filepath = os.path.join(save_dir, desig + '_bad.dat')
            good_filepath = os.path.join(save_dir, desig + '_good.dat')
            print(
                f' bad_filepath= {bad_filepath} , good_filepath= {good_filepath} '
            )
            with open(bad_filepath, 'w') as bad_fh:
                with open(good_filepath, 'w') as good_fh:

                    # If we are deleting duplicates ...
                    if DELETING:

                        seen = {}
                        fixed_data = []
                        incorrect_dict = {_: True for _ in incorrect_list}
                        for line in data:
                            # If the lines are to be deleted, record to tell MR so that the mysql database can be updated
                            if line.strip('\n') in incorrect_dict and line not in seen:
                                bad_fh.write(line)
                            # If we are keeping the line ...
                            else:
                                fixed_data.append(line)
                            # Record that we have seen the line so that we can stop ourselves deleting it twice!
                            seen[line] = True

                    # If not deleting, but doing replacement ...
                    else:
                        for incorrect_published_obs80, corrected_obs80 in zip(
                                incorrect_list, correct_list):
                            fixed_data = []

                            # Check the inputs
                            assert corrected_obs80 not in ['', ' ', [], [''], ['','']], \
                                f'corrected_obs80 = {corrected_obs80} : not sure that this routine can cope with such input ...'
                            assert isinstance(incorrect_published_obs80, str), \
                                f'incorrect_published_obs80 is of type {type(incorrect_published_obs80)}, rather than a string'

                            #*** (iv) Find the location of the incorrect data in the primary data file
                            line_num = [
                                i for i, line in enumerate(data)
                                if incorrect_published_obs80.strip() in line
                            ]
                            assert len(line_num) < 3, \
                                f'len(line_num)={len(line_num)} which is >=3, which seems like a suspiciously large number, so I am terminating...'

                            #*** (v) Replace the incorrect data with the correct data (the correct data has been created earlier)
                            #        At the same time we also output the incorrect & correct data to some files to be used to update the MYSQL database
                            for n, line in enumerate(data):
                                if n not in line_num:
                                    # We keep the normal stuff as-is
                                    fixed_data.append(line)
                                else:
                                    # For removal from mysql
                                    bad_fh.write(line)

                                    if isinstance(corrected_obs80, str):
                                        l = corrected_obs80 if corrected_obs80[
                                            -1] == '\n' else corrected_obs80 + '\n'
                                        # Corrected data for flat files
                                        fixed_data.append(l)
                                        # Corrected data for mysql
                                        good_fh.write(l)
                                    elif isinstance(corrected_obs80, list):
                                        for _ in corrected_obs80:
                                            l = _ if _[-1] == '\n' else _ + '\n'
                                            # Corrected data for flat files
                                            fixed_data.append(l)
                                            # Corrected data for mysql
                                            good_fh.write(l)
                                    else:
                                        sys.exit(
                                            f'corrected_obs80 is of type {type(corrected_obs80)}: do not know how to process'
                                        )

                            #*** (vi) Do some sense checks of the difference between the initial and fixed versions
                            assert len(fixed_data) - len(data) == len(line_num), \
                                f'Lengths do not make sense: {len(fixed_data), len(data), len(line_num)}'

                            # copy fixed data into data ready for next loop around ...
                            data = copy.deepcopy(fixed_data)

            #*** (vii)  write the data to the temp file
            replace_file = dst_file + 'replace'
            assert not os.path.isfile(replace_file), \
                f'replacement file {replace_file} already exists, which is bad'
            with open(replace_file, 'w') as fh:
                for line in fixed_data:
                    l = line if line[-1] == '\n' else line + '\n'
                    fh.write(l)
            assert os.path.isfile(replace_file), \
                f'replacement file {replace_file} does NOT exist, which is bad'

            #*** (viii) Replace the primary data with the fixed copy
            print(f'replacing file={src_file} with file {replace_file} ')
            #shutil.copyfile(replace_file, src_file)

            #*** (ix) Recreate the index files if necessary
            #         NEED TO BE CAREFUL ABOUT THIS ...
            #         (a) Mike/Dave indicated this is only necessary if the file being altered is one of the permanent,
            #             master files, rather than one of the temp *tot* files
            #         (b) However, my inspection of /share/apps/mpec/publish_dou_mpec.sh, /share/apps/com/indexed/update.sh
            #             (and sub-scripts) suggests that there *ARE* some form of index files for the temp/pending/within-month files
            #         (c) TO gain some understanding, the monthly-prep rebuilds are done here : /sa/com/indexed/update.sh [SAME AS ABOVE]
            #         (d) Given that ... calls /share/apps/com/indexed/buildnumupd.sh

            #*** (xi) Remove / Tidy-up the temp files & temp dir
            #shutil.rmtree(dst_dir)
            #assert not os.path.isdir(dst_dir), f'dst_dir={dst_dir} still exists when it should NOT'

        #*** (ix) Unlock the system
        print('Unsetting the mpc_temp_status')
        mpc_status.set_status("mpc_temp_status", "")

    except Exception as e:
        print('\n' * 2)
        print('EXCEPTION IN fix_primary_flat_file_data')
        print('\n' * 2)
        print(e)
        print('\n' * 2)
        print(
            'Unsetting the mpc_temp_status as part of the EXCEPTION handling')
        mpc_status.set_status("mpc_temp_status", "")

    return True