def create_search_index(argv=None):
    """Create NMSLib index and a reverse lookup CSV file.

    This routine reads every CSV data file matching ``*index*.csv`` in
    the data directory, combines them into one file for reverse lookup
    and uses the embedding strings to create an NMSLib index. The
    embedding is the last column of every CSV row, encoded as a
    comma-separated list of floats; all preceding columns are written to
    the lookup file.

    Both outputs are first written under the temporary directory and then
    copied to their final destinations (``args.lookup_file`` and
    ``args.index_file``).

    Blank rows in the input files are skipped.

    Args:
      argv: A list of strings representing command line arguments.

    Raises:
      ValueError: If an embedding value cannot be parsed as a float.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    args = arguments.parse_arguments(argv)

    if not os.path.isdir(args.tmp_dir):
        logging.info("Creating directory %s", args.tmp_dir)
        os.makedirs(args.tmp_dir)

    # Work on local copies; the final destinations are only touched by the
    # copy steps at the end.
    tmp_index_file = os.path.join(args.tmp_dir,
                                  os.path.basename(args.index_file))
    tmp_lookup_file = os.path.join(args.tmp_dir,
                                   os.path.basename(args.lookup_file))

    embeddings_data = []
    csv_pattern = '{}/*index*.csv'.format(args.data_dir)

    with open(tmp_lookup_file, 'w') as lookup_file:
        lookup_writer = csv.writer(lookup_file)

        for csv_file_path in tf.gfile.Glob(csv_pattern):
            logging.info('Reading %s', csv_file_path)

            with tf.gfile.Open(csv_file_path) as csv_file:
                for row in csv.reader(csv_file):
                    # csv.reader yields an empty list for blank lines;
                    # indexing row[-1] on it would raise IndexError. Skip
                    # the row before touching either output so the lookup
                    # rows and the embeddings stay aligned one-to-one.
                    if not row:
                        continue

                    embedding_vector = [
                        float(value) for value in row[-1].split(',')
                    ]
                    embeddings_data.append(embedding_vector)

                    # Everything except the embedding goes to the lookup
                    # file, in the same order as the embeddings list.
                    lookup_writer.writerow(row[:-1])

    embeddings_data = np.array(embeddings_data)

    search_engine.CodeSearchEngine.create_index(embeddings_data,
                                                tmp_index_file)

    logging.info("Copying file %s to %s", tmp_lookup_file, args.lookup_file)
    tf.gfile.Copy(tmp_lookup_file, args.lookup_file)
    logging.info("Copying file %s to %s", tmp_index_file, args.index_file)
    tf.gfile.Copy(tmp_index_file, args.index_file)
    logging.info("Finished creating the index")
# Example #2
# 0
def start_search_server(argv=None):
    """Launch the Flask REST server used for code search.

    The server keeps an index and a reverse-lookup table in memory and
    exposes them through a simple REST API; it is also handed the UI
    directory so it can serve the front end.

    Steps performed here:
      1. Read every row of the lookup CSV into a list.
      2. Copy the index file into the temporary directory unless a file
         with the same base name is already there.
      3. Build the query encoder and bind it, together with the serving
         URL, into an embedding function.
      4. Construct the search engine and server, then run the server.

    Args:
      argv: A list of strings representing command line arguments.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    args = arguments.parse_arguments(argv)

    if not os.path.isdir(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    # Load the whole reverse-lookup table into memory.
    tf.logging.debug('Reading {}'.format(args.lookup_file))
    with tf.gfile.Open(args.lookup_file) as lookup_handle:
        lookup_rows = list(csv.reader(lookup_handle))

    index_basename = os.path.basename(args.index_file)
    local_index_path = os.path.join(args.tmp_dir, index_basename)

    # Reuse a previously copied index if one is already present locally.
    tf.logging.debug('Reading {}'.format(args.index_file))
    if not os.path.isfile(local_index_path):
        tf.gfile.Copy(args.index_file, local_index_path)

    # Build an encoder for the natural language strings.
    encoder = build_query_encoder(args.problem,
                                  args.data_dir,
                                  embed_code=False)
    embed = functools.partial(embed_query, encoder, args.serving_url)

    engine = CodeSearchEngine(local_index_path, lookup_rows, embed)
    server = CodeSearchServer(engine,
                              args.ui_dir,
                              host=args.host,
                              port=args.port)
    server.run()