def index_patent(ipath):
    """Index patent.tsv into Elasticsearch.

    Parameters
    ----------
    ipath : str
        Path to patent.tsv.
    """
    print('Indexing patent.')
    index_name = 'patent_tmp'
    opath = os.path.join(os.path.dirname(ipath), 'patent.index.tmp.json')
    create_index(index_name)
    chunks = pd.read_csv(ipath, sep='\t', quoting=3, lineterminator='\n',
                         dtype=str, chunksize=50000)
    for chunk in chunks:
        chunk.drop(columns=['type', 'number', 'country', 'kind',
                            'num_claims', 'filename', 'withdrawn'],
                   inplace=True)
        chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')
        # 'date' is included here so rows whose date failed to parse (NaT
        # after the coercion above) are dropped before .date() is called.
        chunk.dropna(axis='index', subset=['id', 'date', 'title', 'abstract'],
                     how='any', inplace=True)
        with open(opath, 'w') as ofp:
            for _, patent in chunk.iterrows():
                # Bulk API format: one action line, then one document line.
                json.dump({'index': {'_index': index_name}}, ofp)
                ofp.write('\n')
                json.dump(
                    {
                        'id': patent['id'],
                        'date': str(patent['date'].date()),
                        'title': patent['title'].lower().strip(),
                        'abstract': patent['abstract'].lower().strip()
                    }, ofp)
                ofp.write('\n')
        bulk_insert(index_name, opath)
    refresh(index_name)
    os.remove(opath)
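# The helpers used above (`create_index`, `bulk_insert`, `refresh`) are
# defined elsewhere. A minimal sketch of what they might look like, assuming
# a local node and 7.x-style calls from the official `elasticsearch` Python
# client; this is illustrative, not the project's actual implementation.
from elasticsearch import Elasticsearch

_es = Elasticsearch(['http://localhost:9200'])

def create_index(index_name):
    # Start from a clean index; ignore the 404 if it does not exist yet.
    _es.indices.delete(index=index_name, ignore=404)
    _es.indices.create(index=index_name)

def bulk_insert(index_name, path):
    # The temp file already holds newline-delimited bulk actions, so it can
    # be posted to the _bulk endpoint verbatim.
    with open(path) as fp:
        _es.bulk(body=fp.read(), index=index_name)

def refresh(index_name):
    # Make newly indexed documents visible to search.
    _es.indices.refresh(index=index_name)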
def create_es_index():
    """Connect to Elasticsearch, create the 'matches' index if it is not
    present, and return True on success."""
    global es_conn
    es_conn = es.connect_elasticsearch()
    created = es.create_index(es_conn, 'matches', index_settings())
    # The docstring promises a boolean, so return the creation result.
    return created
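# `es.connect_elasticsearch` and `index_settings` live elsewhere in the
# project. A plausible sketch under those names (assumptions, not the real
# implementations): connect to a local node and use a minimal single-shard
# settings body as a placeholder.
from elasticsearch import Elasticsearch

def connect_elasticsearch():
    conn = Elasticsearch(['http://localhost:9200'])
    return conn if conn.ping() else None

def index_settings():
    # Placeholder settings; the real 'matches' index likely defines
    # analyzers and mappings as well.
    return {'settings': {'number_of_shards': 1, 'number_of_replicas': 0}}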
def on_post(self, req, resp):
    cmd = req.get_param('cmd')
    result = {}
    if cmd == 'add':
        book = req.get_param('book')
        file_path = save_file(book)
        task_data = {'path': file_path}
        try:
            # Hand the book off to the Celery worker for indexing.
            add_book_task.delay(task_data)
            result = {'msg': 'file put in queue'}
        except Exception as e:
            result = {'error': str(e)}
            delete_file(file_path)
    elif cmd == 'create':
        result = create_index()
    elif cmd == 'delete':
        result = delete_index()
    elif cmd == 'count':
        result = count_items()
    elif cmd == 'search':
        q = req.get_param('q')
        result = search(q)
    elif cmd == 'search_advanced':
        q = req.get_param('q')
        result = search_advanced(q)
    resp.body = json.dumps(result)
    resp.status = falcon.HTTP_200
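# A hypothetical client call against this responder, assuming the Falcon app
# routes it at /books on localhost:8000 (route and port are assumptions).
# Falcon's get_param reads query-string parameters, so the command goes in
# params rather than the request body.
import requests

r = requests.post('http://localhost:8000/books', params={'cmd': 'count'})
print(r.json())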
def create_indices(indices=None, set_aliases=True):
    result = []
    aliases = []
    indices = indices or []
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    for index_alias, type_classes in get_indices(indices).items():
        # Layer any per-index custom settings over the project defaults.
        index_settings = recursive_dict_update(
            getattr(
                config.settings,
                'ELASTICSEARCH_DEFAULT_INDEX_SETTINGS', {}
            ),
            getattr(
                config.settings,
                'ELASTICSEARCH_CUSTOM_INDEX_SETTINGS', {}
            ).get(index_alias, {})
        )
        # Each physical index gets a timestamp suffix; the bare alias name
        # is pointed at it afterwards.
        index_name = '{0}-{1}'.format(index_alias, now)
        aliases.append((index_alias, index_name))
        type_mappings = {}
        for type_class in type_classes:
            tmp = type_class.get_type_mapping()
            if tmp:
                type_mappings[type_class.get_type_name()] = tmp
            result.append((
                type_class,
                index_alias,
                index_name
            ))
        # if we got any type mappings, put them in the index settings
        if type_mappings:
            index_settings['mappings'] = type_mappings
        es.create_index(index_name, index_settings)
    if set_aliases:
        create_aliases(aliases)
    return result, aliases
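# `recursive_dict_update` is imported from elsewhere; judging by its use
# above, it is the usual recursive-merge helper. A minimal sketch under that
# assumption:
def recursive_dict_update(base, updates):
    # Merge `updates` into a copy of `base`, recursing into nested dicts
    # instead of overwriting them wholesale.
    merged = dict(base)
    for key, value in updates.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = recursive_dict_update(merged[key], value)
        else:
            merged[key] = value
    return merged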
def index_claim(ipath):
    """Index claim.tsv into Elasticsearch.

    Parameters
    ----------
    ipath : str
        Path to claim.tsv.
    """
    print('Indexing claim.')
    index_name = 'claim_tmp'
    opath = os.path.join(os.path.dirname(ipath), 'claim.index.tmp.json')
    create_index(index_name)
    chunks = pd.read_csv(ipath, sep='\t', quoting=3, lineterminator='\n',
                         dtype=str, chunksize=50000)
    for chunk in chunks:
        chunk.drop(columns=['dependent', 'sequence', 'exemplary'],
                   inplace=True)
        chunk.dropna(axis='index', subset=['uuid', 'text', 'patent_id'],
                     how='any', inplace=True)
        with open(opath, 'w') as ofp:
            for _, claim in chunk.iterrows():
                # Bulk API format: one action line, then one document line.
                json.dump({'index': {'_index': index_name}}, ofp)
                ofp.write('\n')
                json.dump(
                    {
                        'id': claim['patent_id'],
                        'text': claim['text'].lower().strip()
                    }, ofp)
                ofp.write('\n')
        bulk_insert(index_name, opath)
    refresh(index_name)
    os.remove(opath)
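# Hypothetical driver for the two indexers above; the data/ paths are
# assumptions, not paths taken from the project.
index_patent('data/patent.tsv')
index_claim('data/claim.tsv')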
import es
import json
import glob
import time

# Rebuild the 'messages' index from scratch.
es.delete_index("messages")
es.create_index("messages")

nb = 0
for filename in glob.iglob('data/**/*.json', recursive=True):
    nb += 1
    # Pause briefly every 500 documents to avoid overloading the node.
    if nb % 500 == 0:
        time.sleep(1)
    with open(filename, encoding="utf8") as f:
        item = json.load(f)
    es.index("messages", "message", item)
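# Indexing one document per call keeps the loop simple but costs one HTTP
# round trip per file. A sketch of a bulk variant, assuming the official
# `elasticsearch` client is available alongside the local `es` wrapper;
# the connection URL is an assumption, and the "message" doc type is
# dropped since mapping types are deprecated in recent Elasticsearch.
from elasticsearch import Elasticsearch, helpers

def iter_actions(pattern='data/**/*.json'):
    # Yield one bulk action per JSON file found under data/.
    for fn in glob.iglob(pattern, recursive=True):
        with open(fn, encoding='utf8') as f:
            yield {'_index': 'messages', '_source': json.load(f)}

helpers.bulk(Elasticsearch(['http://localhost:9200']), iter_actions())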