def get_android_reviewFromeAuMarket():
    """Collect the newest au Market reviews, store unseen ones and send them.

    The first page of the review list is fetched with a mobile user agent and
    every ``t-media__body`` element that has at least seven child nodes is
    parsed into a review. A review that ``db.query_is_comments_existed`` does
    not report as known is inserted under ``"android_au"``, translated from
    ``ja`` to ``zh-TW`` and, when it is newer than ``BEFORE_DAY`` days, handed
    to ``send_message``. The loop stops once ``REVIEW_COUNT_LIMIT`` reviews
    have been sent.
    """
    db = MongoManager()
    translator = Translator()

    url = 'https://pass.auone.jp/app/detail/review/list?app_id=3976700000002&sort=post&display_ver=new&page_num=1'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Mobile Safari/537.36'
    }
    # A timeout keeps a stalled connection from hanging the job forever.
    res = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.text, "lxml")
    rows = soup.find_all(match_class(["t-media__body"]))

    review_count = 0
    for row in rows:
        parts = row.contents
        if len(parts) < 7:
            continue

        meta = parts[3].contents
        # The comment is partially visible when the row has 9 children
        # (text lives in child 7) and fully visible otherwise (child 5).
        body = parts[7] if len(parts) == 9 else parts[5]

        # Build a fresh dict per review so nothing leaks from the previous
        # row (including fields the database layer may add on insert).
        comment_info = {
            "title": parts[1].contents[1].contents[0].strip(),
            "translated_title": "",
            "score": float(meta[3].contents[0].strip()),
            "date": datetime.strptime(meta[5].contents[0].strip(), "%Y/%m/%d"),
            # Every second child is a text fragment; the ones in between are
            # skipped exactly as the index-step-2 loop did.
            "comment": "".join(node.strip() for node in body.contents[::2]),
            "translated_comment": "",
            "version": None,
        }

        # help sender stop
        if review_count >= REVIEW_COUNT_LIMIT:
            print('>>>> Limited reviews count.')
            break

        if not db.query_is_comments_existed("android_au", comment_info):
            db.insert_reviews("android_au", comment_info)
            comment_info["translated_title"] = translator.GetTextAndTranslateFromGoogle(
                'ja', 'zh-TW', comment_info["title"])
            comment_info["translated_comment"] = translator.GetTextAndTranslateFromGoogle(
                'ja', 'zh-TW', comment_info["comment"])
            # would not send msg when its date is older than BEFORE_DAY days
            if datetime.now() - comment_info['date'] < timedelta(days=BEFORE_DAY):
                send_message(comment_info, FOOTER[0], FOOTER_ICON[0])  # post message to slack
                review_count += 1

    print('>>>> ' + str(review_count) + ' reviews be sent.')
def get_android_reviewFromGooglePlay():
    """Collect the newest Google Play reviews, store unseen ones and send them.

    The review payload is requested with a form-encoded POST, unwrapped with
    ``get_json`` and decoded as JSON. Each entry under the first key is turned
    into a review. A review that ``db.query_is_comments_existed`` does not
    report as known is inserted under ``"android_gp"``, translated from ``ja``
    to ``zh-TW`` and, when it is newer than ``BEFORE_DAY`` days, handed to
    ``send_message``. The loop stops once ``REVIEW_COUNT_LIMIT`` reviews have
    been sent.
    """
    db = MongoManager()
    translator = Translator()

    url = 'https://play.google.com/_/PlayStoreUi/data?ds.extension=136880256&f.sid=-6639796089229955814&hl=ja&bl=boq_playuiserver_20180731.12_p0&soc-app=121&soc-platform=1&soc-device=1&authuser&_reqid=245019&rt=c'
    headers = {
        u'Content-Type': u'application/x-www-form-urlencoded;charset=utf-8'
    }
    payload = {
        u'f.req': u'[[[136880256,[{"136880256":[null,null,[2,2,[40,null]],["com.kddi.android.UtaPass",7]]}],null,null,0]]]'}
    # A timeout keeps a stalled connection from hanging the job forever.
    req = requests.post(url, data=payload, headers=headers, timeout=30).text
    json_dict = json.loads(get_json(req))

    review_count = 0
    key = list(json_dict.keys())[0]
    if not json_dict[key]:
        print('>>>> There has no review data.')
        return

    for review in json_dict[key][0]:
        # Build a fresh dict per review so nothing leaks from the previous
        # entry (including fields the database layer may add on insert).
        comment_info = {
            "title": review[1][0],
            "translated_title": "",
            "score": review[2],
            "date": datetime.fromtimestamp(review[5][0]),
            "comment": review[4],
            "translated_comment": "",
            "version": review[10],
        }

        # help sender stop
        if review_count >= REVIEW_COUNT_LIMIT:
            print('>>>> Limited reviews count.')
            break

        if not db.query_is_comments_existed("android_gp", comment_info):
            db.insert_reviews("android_gp", comment_info)
            comment_info["translated_title"] = translator.GetTextAndTranslateFromGoogle(
                'ja', 'zh-TW', comment_info["title"])
            comment_info["translated_comment"] = translator.GetTextAndTranslateFromGoogle(
                'ja', 'zh-TW', comment_info["comment"])
            # would not send msg when its date is older than BEFORE_DAY days
            if datetime.now() - comment_info['date'] < timedelta(days=BEFORE_DAY):
                send_message(comment_info, FOOTER[1], FOOTER_ICON[1])  # post message to slack
                review_count += 1

    print('>>>> ' + str(review_count) + ' reviews be sent.')
class JRCFileParserService(object):
    '''
    Reads a tab-separated input file, groups consecutive rows that share the
    same id (first column) into one entity and pushes each entity into
    MongoDB through a MongoManager.
    '''

    def __init__(self, file_path, db_config, schema, table, batch_size):
        self.file_path = file_path
        self.manager = MongoManager(schema, table, batch_size, db_config)

    def process(self):
        '''
        Parse the file and push one record per run of rows with equal id.

        Returns whatever ``self.manager.flushBatch()`` returns.
        '''
        print("Reading File  %s" % (self.file_path,))
        count_record = 0
        entity_count = 0
        similar_record = []
        previous_record_id = None
        with open(self.file_path, 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')
            for row in reader:
                if not row:
                    # csv yields [] for blank lines; they carry no id.
                    continue
                # Only flush when a group was actually collected: pushing the
                # initial empty group would fail on data_list[0][0].
                if similar_record and previous_record_id != row[0]:
                    self.manager.pushRecords(
                        self.getInsertObject(similar_record))
                    entity_count += 1
                    similar_record = []
                similar_record.append(row)
                previous_record_id = row[0]
                count_record += 1
        # Flush the trailing group, if the file had any rows at all.
        if similar_record:
            self.manager.pushRecords(self.getInsertObject(similar_record))
            entity_count += 1
        print("Records Processed  %s" % (count_record,))
        print("Entity Processed  %s" % (entity_count,))
        return self.manager.flushBatch()

    def getInsertObject(self, data_list):
        '''
        Build the document for one entity.

        data_list is a non-empty list of rows ``[id, type_code, lang, name]``
        that all belong to the same entity. The id and type are taken from
        the first row; every row contributes one variation and one
        lower-cased compare string.
        '''
        type_names = {'P': 'PERSON', 'O': 'ORGANIZATION'}
        first = data_list[0]
        d = {}
        d['id'] = int(first[0])
        d['type'] = type_names.get(first[1], 'UNKNOWN')
        d['variations'] = [{'lang': r[2], 'name': r[3]} for r in data_list]
        d['compare_strings'] = [r[3].lower() for r in data_list]
        return d
def get_ios_review():
    """Collect the newest App Store reviews, store unseen ones and send them.

    The customer-review RSS feed is parsed with ``feedparser``; the first
    entry is skipped. A review that ``db.query_is_comments_existed`` does not
    report as known is inserted under ``"ios"``, translated from ``ja`` to
    ``zh-TW`` and, when it is newer than ``BEFORE_DAY`` days, handed to
    ``send_message``. The loop stops once ``REVIEW_COUNT_LIMIT`` reviews have
    been sent.
    """
    db = MongoManager()
    translator = Translator()

    url = 'https://itunes.apple.com/jp/rss/customerreviews/page=1/id=579510737/sortby=mostrecent/xml'
    feed = feedparser.parse(url)

    review_count = 0
    for entry in feed.entries[1:]:
        # Build a fresh dict per review so nothing leaks from the previous
        # entry (including fields the database layer may add on insert).
        comment_info = {
            "title": entry['title'],
            "translated_title": "",
            "score": entry['im_rating'],
            # Parse only the "YYYY-MM-DDTHH:MM:SS" prefix: the result is the
            # same naive datetime as before for "-07:00" stamps, and stamps
            # with any other UTC offset no longer raise ValueError.
            "date": datetime.strptime(entry['updated'][:19], "%Y-%m-%dT%H:%M:%S"),
            "comment": entry['content'][0]['value'],
            "translated_comment": "",
            "version": entry['im_version'],
        }

        # help sender stop
        if review_count >= REVIEW_COUNT_LIMIT:
            print('>>>> Limited reviews count.')
            break

        if not db.query_is_comments_existed("ios", comment_info):
            db.insert_reviews("ios", comment_info)
            comment_info["translated_title"] = translator.GetTextAndTranslateFromGoogle(
                'ja', 'zh-TW', comment_info["title"])
            comment_info["translated_comment"] = translator.GetTextAndTranslateFromGoogle(
                'ja', 'zh-TW', comment_info["comment"])
            # would not send msg when its date is older than BEFORE_DAY days
            if datetime.now() - comment_info['date'] < timedelta(days=BEFORE_DAY):
                send_message(comment_info)  # post message to slack
                review_count += 1

    print('>>>> ' + str(review_count) + ' reviews be sent.')
def __init__(self, db_config, schema, table, batch_size, cameo_table,
             jrc_table, jrc_cameo_table):
    """Load the config file and create one MongoManager per collection.

    Four managers are built from the same schema, batch size and db_config:
    for ``table``, ``cameo_table``, ``jrc_table`` and ``jrc_cameo_table``.
    Note that ``self.jrc_cameo_table`` holds a manager, not a table name.
    ``self.translation`` is the collection of the translator manager.
    """

    def manager_for(collection_name):
        # All managers differ only in the collection they point at.
        return MongoManager(schema, collection_name, batch_size, db_config)

    self.config = ConfigParser.ConfigParser()
    self.config.read('../config/config.cnf')

    self.translator_manager = manager_for(table)
    self.cameo_manager = manager_for(cameo_table)
    self.jrc_manager = manager_for(jrc_table)
    self.jrc_cameo_table = manager_for(jrc_cameo_table)

    translation_collection = self.translator_manager.get_collection()
    self.translation = translation_collection
def __init__(self, api_infor):
    """Read ``../.config`` and set the crawler defaults.

    api_infor is stored unchanged in ``self._api_infor``. A Mongo client is
    obtained from ``MongoManager.getInstance()`` only when
    ``self.mongoclient`` is still None.
    """
    self.configparser = ConfigParser()
    self.configparser.read('../.config')
    self._api_infor = api_infor
    self._format = "json"
    self._userLimit = 100
    self._mongodb = "lastfm"
    self._mongoUserCollection = "user"
    self._mongoFriendCollection = "friend"
    # mongoclient is read before it is assigned here, so it is presumably a
    # class-level attribute defaulting to None — TODO confirm.
    # Identity check: "== None" can be fooled by a custom __eq__.
    if self.mongoclient is None:
        self.mongoclient = MongoManager.getInstance()


def __del__(self):
    """Close the Mongo client, if one was ever attached."""
    # __del__ also runs when __init__ failed part-way, so the attribute may
    # be missing or still None; never raise from a finalizer.
    client = getattr(self, 'mongoclient', None)
    if client is not None:
        client.close()
def __init__(self, db_config, schema, table, batch_size, cameo_table,
             jrc_table):
    """Create the managers and index every JRC compare string in memory.

    ``self.jrc_cache`` maps the first character of a compare string to a
    list of ``{'name', 'jrc_id', '_id'}`` dicts, one per compare string of
    every document returned by the JRC collection's ``find()``.
    """
    self.manager = MongoManager(schema, table, batch_size, db_config)
    self.cameo_manager = MongoManager(schema, cameo_table, batch_size,
                                      db_config)
    jrc_manager = MongoManager(schema, jrc_table, batch_size, db_config)
    jrc = jrc_manager.get_collection()

    self.jrc_cache = {}
    for jrc_record in jrc.find():
        for name in jrc_record['compare_strings']:
            if not name:
                # An empty string has no first character to bucket by.
                continue
            # setdefault avoids building the key list for every name.
            self.jrc_cache.setdefault(name[0], []).append({
                'name': name,
                'jrc_id': jrc_record['id'],
                '_id': jrc_record['_id']
            })
import sys
import subprocess
from MongoManager import MongoManager
from CertManager import CertManager

# Module-level manager instances used by the functions below.
mm = MongoManager()
cm = CertManager()


def add_server(user, alias, ssh_string=None, group=None):
    """Register a server for a user and provision its certificate.

    Does nothing and returns None when ssh_string is None. Otherwise a
    certificate named "<user>_<alias>" is created and uploaded through the
    CertManager, and the server and certificate are recorded through the
    MongoManager. The group argument is not used by this function.
    """
    if ssh_string is None:
        return
    cert_name = user + '_' + alias
    cm.add_cert(cert_name)
    mm.add_server(user, ssh_string, alias, cert_name)
    cm.upload_cert(cert_name, ssh_string)
    # store .pem cert as bytes in db
    cert_path = cm.get_cert_path(cert_name)
    mm.store_cert(user, alias, cert_path)
    # remove temporary cert
    # NOTE(review): called with (user, alias) while the other CertManager
    # calls take cert_name — confirm against CertManager.rm_temp_cert.
    cm.rm_temp_cert(user, alias)
    # download db cert
    mm.download_cert(user, alias)


def remove_server(user, alias, group=None):
class SearchController(tornado.web.RequestHandler):
    """Answer a search query with BabelNet, DBpedia and CAMEO/JRC results.

    Configuration and the Mongo collections are resolved once, at class
    definition time, from '../config/config.cnf'.
    """
    config = ConfigParser.ConfigParser()
    configPath = '../config/config.cnf'
    config.read(configPath)
    db_config = {
        'host': config.get('MongoDBConnection', 'db.host'),
        'port': config.get('MongoDBConnection', 'db.port'),
        'username': config.get('MongoDBConnection', 'db.username'),
        'password': config.get('MongoDBConnection', 'db.password')
    }
    schema = config.get('MongoDBConnection', 'db.schema')
    batch_size = config.get('MongoDBConnection', 'db.batch_limit')
    cameo_table = config.get('Cameo', 'db.Cameo')
    jrc_table = config.get('JRCNames', 'db.JRCNames')
    cameo_jrc_table = config.get('CameoJRC', 'db.CameoJRCCountryActor')
    # These two names first hold the collection name and are then rebound
    # to the collection object below.
    bablenet_cache = config.get('BableNet', 'db.BableNet.Cache')
    dbpedia_cache = config.get('DBPedia', 'db.DBPedia.Cache')
    cameo = MongoManager(schema, cameo_table, batch_size,
                         db_config).get_collection()
    jrc = MongoManager(schema, jrc_table, batch_size,
                       db_config).get_collection()
    cameo_jrc = MongoManager(schema, cameo_jrc_table, batch_size,
                             db_config).get_collection()
    bablenet_cache = MongoManager(schema, bablenet_cache, batch_size,
                                  db_config).get_collection()
    dbpedia_cache = MongoManager(schema, dbpedia_cache, batch_size,
                                 db_config).get_collection()

    def _fetch_json(self, host, path):
        """GET ``path`` from ``host`` and return the decoded JSON body."""
        conn = httplib.HTTPConnection(host)
        try:
            conn.request("GET", path, None, {"Accept": "application/json"})
            return json.loads(conn.getresponse().read())
        finally:
            # Always release the socket, including when the request or the
            # JSON decoding fails.
            conn.close()

    def get(self):
        """Handle GET: look the query up in all sources and write JSON."""
        logger.info(
            self.config.get('Logging', 'Logger.GetMessage1') + '' +
            self.request.remote_ip)
        query = self.get_argument(
            self.config.get('AccessParameters', 'Access.QueryString'))
        result = {}
        result['query'] = query
        normalized_query = FormatConverterUtil.convertToCompareFormat(query)

        # Get result from Bablenet and push to object
        search_query = "/v4/getSenses?word=" + normalized_query + "&lang=EN&pos=NOUN&filterLangs=AR&filterLangs=ES&key=" + self.config.get(
            'BableNet', 'access.key')
        result['bablenet'] = self._fetch_json('babelnet.io', search_query)

        # Get result from DBpedia and push to object
        search_query = "/api/search/KeywordSearch?QueryString=" + normalized_query
        result['dbpedia'] = self._fetch_json('lookup.dbpedia.org',
                                             search_query)

        # get result from cameo and JRC and push to object
        result['cameojrc'] = self.get_cameo_jrc_result(normalized_query)
        result['status'] = self.config.get('GeneralMsg', 'Status.success')
        result['licence'] = self.config.get('GeneralMsg', 'Licence.Ack')
        self.write(json_encode(result))
        self.set_header("Content-Type", "application/json")

    def get_cameo_jrc_result(self, normalized_query):
        """Return CAMEO/JRC relation records matching the query.

        A record matches when its ``cameo_string`` or ``jrc_string`` equals
        normalized_query. Each record gets the referenced CAMEO and JRC
        documents attached (without ``_id`` and ``compare_strings``) and has
        its own id fields removed.
        """
        cameo_jrc_data = self.cameo_jrc.find({
            "$or": [{
                "cameo_string": normalized_query
            }, {
                "jrc_string": normalized_query
            }]
        })
        projection = {"_id": 0, "compare_strings": 0}
        cameo_jrc_result = list(cameo_jrc_data)
        for data in cameo_jrc_result:
            data['cameo_data'] = list(
                self.cameo.find({"_id": data['cameo_id']}, projection))
            data['jrc_data'] = list(
                self.jrc.find({"_id": data['jrc_id']}, projection))
            del data['cameo_id']
            del data['jrc_id']
            del data['_id']
        return cameo_jrc_result
class FilterController(tornado.web.RequestHandler):
    """Return name translations for a query from the JRC and BabelNet data.

    Configuration and the translation collection are resolved once, at class
    definition time, from '../config/config.cnf'.
    """
    config = ConfigParser.ConfigParser()
    configPath = '../config/config.cnf'
    config.read(configPath)
    db_config = {
        'host': config.get('MongoDBConnection', 'db.host'),
        'port': config.get('MongoDBConnection', 'db.port'),
        'username': config.get('MongoDBConnection', 'db.username'),
        'password': config.get('MongoDBConnection', 'db.password')
    }
    schema = config.get('MongoDBConnection', 'db.schema')
    batch_size = config.get('MongoDBConnection', 'db.batch_limit')
    cameo_table = config.get('Cameo', 'db.Cameo')
    jrc_table = config.get('JRCNames', 'db.JRCNames')
    translation_table = config.get('NameTranslation', 'db.NameTranslation')
    translation_manager = MongoManager(schema, translation_table, batch_size,
                                       db_config)
    translation = translation_manager.get_collection()

    def getFilterLang(self, lookup_lang):
        """Build the ``filterLangs=`` query fragment for BabelNet.

        lookup_lang is a comma-separated language list. The languages are
        upper-cased and at most the first three are used, joined as
        ``filterLangs=A&filterLangs=B&filterLangs=C``.
        """
        langs = lookup_lang.upper().split(',')
        # The previous code indexed langs[3] for the third entry, which
        # raised IndexError for exactly three languages.
        return '&'.join('filterLangs=' + lang for lang in langs[:3])

    def get(self):
        """Handle GET: collect results for the requested source(s)."""
        logger.info(
            self.config.get('Logging', 'Logger.GetMessage2') + '' +
            self.request.remote_ip)
        try:
            query = self.get_argument(
                self.config.get('AccessParameters', 'Access.QueryString'))
            source = self.get_argument(
                self.config.get('AccessParameters', 'Access.Source'),
                default='both')
            filter_lang = self.get_argument(
                self.config.get('AccessParameters', 'Access.LookupLang'),
                default=self.config.get('AccessParameters',
                                        'Access.FilterLang'))
            result = {}
            result['query'] = query
            normalized_query = FormatConverterUtil.convertToCompareFormat(
                query)
            if source == 'jrc':
                # return only jrc result
                result['jrc'] = self.get_jrc_result(normalized_query,
                                                    filter_lang)
            elif source == 'bablenet':
                # return only bablenet result
                result['bablenet'] = self.get_bablenet_result(
                    normalized_query, filter_lang)
            else:
                result['jrc'] = self.get_jrc_result(normalized_query,
                                                    filter_lang)
                result['bablenet'] = self.get_bablenet_result(
                    normalized_query, filter_lang)
            result['status'] = self.config.get('GeneralMsg', 'Status.success')
            result['licence'] = self.config.get('GeneralMsg', 'Licence.Ack')
        except MissingArgumentError:
            result = {
                "status": "Failed",
                "Message": "The query string is wrong on not present."
            }
        self.write(json_encode(result))
        self.set_header("Content-Type", "application/json")

    def get_result_database(self, normalized_query, filter_lang, source):
        """Return stored translations for the query, source and languages.

        filter_lang is a comma-separated language list; it is lower-cased
        and matched against the ``lang`` field. The bookkeeping fields
        ``cameo_name``, ``creation_timestamp`` and ``_id`` are removed from
        every returned document.
        """
        language_array = [{'lang': lang}
                          for lang in filter_lang.lower().split(',')]
        translation_data = self.translation.find({
            "$and": [{
                "cameo_name": normalized_query
            }, {
                "source": source
            }, {
                "$or": language_array
            }]
        })
        translation_result = list(translation_data)
        for data in translation_result:
            # pop() tolerates documents stored without one of the fields.
            data.pop('cameo_name', None)
            data.pop('creation_timestamp', None)
            data.pop('_id', None)
        return translation_result

    def get_bablenet_result(self, normalized_query, filter_lang):
        """Return BabelNet translations, querying the web service on a miss.

        When nothing is stored for the query, the senses are fetched from
        babelnet.io, stored through the translation manager and the database
        is read again.
        """
        cache_data = self.get_result_database(normalized_query, filter_lang,
                                              'bablenet')
        if len(cache_data) < 1:
            # Get result from Bablenet and push to object
            lookup_lang = self.getFilterLang(filter_lang)
            headers = {"Accept": "application/json"}
            search_query = "/v4/getSenses?word=" + normalized_query + \
                           "&lang=EN&pos=NOUN&" + lookup_lang + "&key=" \
                           + self.config.get('BableNet', 'access.key')
            conn = httplib.HTTPConnection('babelnet.io')
            try:
                conn.request("GET", search_query, None, headers)
                raw_result = json.loads(conn.getresponse().read())
            finally:
                # Release the socket even when the request fails.
                conn.close()
            # Anything other than a list (e.g. an error object) carries no
            # sense records to store.
            senses = raw_result if isinstance(raw_result, list) else []
            for record in senses:
                self.translation_manager.pushRecords(
                    self.getInsertObject(normalized_query, "bablenet",
                                         record['language'].lower(),
                                         record['lemma'].replace('_', ' ')))
            if senses:
                self.translation_manager.flushBatch()
            cache_data = self.get_result_database(normalized_query,
                                                  filter_lang, 'bablenet')
            logger.info(
                self.config.get('Logging', 'Logger.GetMessage3') + '' +
                normalized_query)
        return cache_data

    def getInsertObject(self, cameo_name, source, lang, name):
        """Build one translation document stamped with the current time."""
        d = {}
        d['cameo_name'] = cameo_name
        d['source'] = source
        d['lang'] = lang
        d['name'] = name
        d['creation_timestamp'] = datetime.now()
        return d

    def get_jrc_result(self, normalized_query, filter_lang):
        """Return JRC translations with '+' replaced by spaces."""
        result = self.get_result_database(normalized_query, filter_lang,
                                          'jrc')
        for r in result:
            # Documents built by getInsertObject carry the text under
            # 'name'; 'names' is still honoured in case stored data uses it.
            for key in ('names', 'name'):
                if key in r:
                    r[key] = r[key].replace('+', ' ')
        return result
# Load the configuration and collect the MongoDB connection settings.
config.read(config_file)
db_config = {
    'host': config.get('MongoDBConnection', 'db.host'),
    'port': config.get('MongoDBConnection', 'db.port'),
    'username': config.get('MongoDBConnection', 'db.username'),
    'password': config.get('MongoDBConnection', 'db.password')
}
schema = config.get('MongoDBConnection', 'db.schema')
batch_size = config.get('MongoDBConnection', 'db.batch_limit')
cameo_table = config.get('Cameo', 'db.Cameo')
# jrc_table and target_table are read but not used in the visible part of
# this script.
jrc_table = config.get('JRCNames', 'db.JRCNames')
target_table = config.get('CameoJRC', 'db.CameoJRCCountryActor')

# Number of entities in Cameo.Phoenix.Countries.actors: the sum of the
# lengths of 'compare_strings' over all records of that type.
manager = MongoManager(schema, cameo_table, batch_size, db_config)
cameo = manager.get_collection()
cameo_data = cameo.find({"record_type": "Cameo.Phoenix.Countries.actors"})
counter = 0
for cameo_record in cameo_data:
    cameo_compare_list = cameo_record['compare_strings']
    counter += len(cameo_compare_list)
print "# of entities in Cameo.Phoenix.Countries.actors: ", counter

# Same count for Cameo.Phoenix.International.actors.
cameo_data = cameo.find(
    {"record_type": "Cameo.Phoenix.International.actors"})
counter = 0
for cameo_record in cameo_data:
    cameo_compare_list = cameo_record['compare_strings']
    counter += len(cameo_compare_list)
print "# of entities in Cameo.Phoenix.International.actors: ", counter
class nameTranslatorService(object):
    '''
    Collects name translations for CAMEO compare strings from the JRC data
    and from the BabelNet web service and pushes them into MongoDB.
    '''

    def __init__(self, db_config, schema, table, batch_size, cameo_table,
                 jrc_table, jrc_cameo_table):
        self.config = ConfigParser.ConfigParser()
        self.config.read('../config/config.cnf')
        self.translator_manager = MongoManager(schema, table, batch_size,
                                               db_config)
        self.cameo_manager = MongoManager(schema, cameo_table, batch_size,
                                          db_config)
        self.jrc_manager = MongoManager(schema, jrc_table, batch_size,
                                        db_config)
        # Despite the name this attribute holds a manager, not a table name.
        self.jrc_cameo_table = MongoManager(schema, jrc_cameo_table,
                                            batch_size, db_config)
        self.translation = self.translator_manager.get_collection()

    def process(self, process_jrc, process_bable_net):
        '''
        Run the selected translation passes.

        process_jrc: when true, store every JRC name variation linked to a
            CAMEO compare string through the CAMEO/JRC relation collection.
        process_bable_net: when true, look up the selected CAMEO records in
            BabelNet.
        Returns whatever the translator manager's flushBatch() returns.
        '''
        cameo = self.cameo_manager.get_collection()
        jrc = self.jrc_manager.get_collection()
        jrc_cameo = self.jrc_cameo_table.get_collection()

        # Parse and get all JRC Names
        if process_jrc:
            cameo_data = cameo.find({}, no_cursor_timeout=True)
            counter = 0
            try:
                for cameo_record in cameo_data:
                    for c in cameo_record['compare_strings']:
                        if len(c) <= 1:
                            continue
                        for record_object in jrc_cameo.find(
                                {"cameo_string": c}):
                            jrc_data = jrc.find_one(
                                {"_id": record_object['jrc_id']})
                            if jrc_data is None:
                                # Relation points at a JRC document that no
                                # longer exists; nothing to translate.
                                continue
                            for variation in jrc_data['variations']:
                                counter += 1
                                self.translator_manager.pushRecords(
                                    self.getInsertObject(
                                        c, "jrc", variation['lang'],
                                        variation['name']))
            finally:
                # A no_cursor_timeout cursor is not reaped automatically, so
                # close it even when the loop fails.
                cameo_data.close()
            print("Total Translation Found :  %s" % (counter,))

        # Parse and get all BableNet Names
        # To filter record to get web service - in production we can remove
        # and run for all
        # {"record_type": "Cameo.Phoenix.Countries.actors", "cameo_title": "PRESIDENT_OF_THE_UNITED_STATES_"}
        if process_bable_net:
            cameo_data = cameo.find(
                {
                    "record_type": "Cameo.Phoenix.Countries.actors",
                    "cameo_title": "PRESIDENT_OF_THE_UNITED_STATES_"
                },
                no_cursor_timeout=True)
            try:
                for cameo_record in cameo_data:
                    for c in cameo_record['compare_strings']:
                        if len(c) > 1:
                            self.get_bablenet_result(
                                c,
                                self.config.get('AccessParameters',
                                                'Access.FilterLang'))
            finally:
                cameo_data.close()
        return self.translator_manager.flushBatch()

    def get_bablenet_result(self, cameo_string, filter_lang):
        '''
        Return BabelNet translations for a CAMEO string.

        When nothing is stored for the normalized string, the senses are
        fetched from babelnet.io, stored and the database is read again.
        '''
        normalized_query = FormatConverterUtil.convertToCompareFormat(
            cameo_string.replace('+', ' ').strip(' '))
        cache_data = self.get_result_database(normalized_query, filter_lang,
                                              'bablenet')
        if len(cache_data) < 1:
            # Get result from Bablenet and push to object
            lookup_lang = self.getFilterLang(filter_lang)
            headers = {"Accept": "application/json"}
            search_query = "/v4/getSenses?word=" + normalized_query + \
                           "&lang=EN&pos=NOUN&" + lookup_lang + "&key=" \
                           + self.config.get('BableNet', 'access.key')
            # NOTE(review): this prints the request path including the
            # access key — confirm that is acceptable for the logs.
            print(search_query)
            conn = httplib.HTTPConnection('babelnet.io')
            try:
                conn.request("GET", search_query, None, headers)
                raw_result = json.loads(conn.getresponse().read())
            finally:
                # Release the socket even when the request fails.
                conn.close()
            # Anything other than a list (e.g. an error object) carries no
            # sense records to store.
            senses = raw_result if isinstance(raw_result, list) else []
            counter = 0
            for record in senses:
                counter += 1
                self.translator_manager.pushRecords(
                    self.getInsertObject(normalized_query, "bablenet",
                                         record['language'].lower(),
                                         record['lemma'].replace('_', ' ')))
            print("Total Translation Found :  %s" % (counter,))
            if senses:
                self.translator_manager.flushBatch()
            cache_data = self.get_result_database(normalized_query,
                                                  filter_lang, 'bablenet')
        return cache_data

    def getFilterLang(self, lookup_lang):
        '''
        Build the "filterLangs=" query fragment for BabelNet.

        lookup_lang is a comma-separated language list. The languages are
        upper-cased and at most the first three are used.
        '''
        langs = lookup_lang.upper().split(',')
        # The previous code indexed langs[3] for the third entry, which
        # raised IndexError for exactly three languages.
        return '&'.join('filterLangs=' + lang for lang in langs[:3])

    def get_result_database(self, normalized_query, filter_lang, source):
        '''
        Return stored translations for the query, source and languages,
        with the bookkeeping fields removed from every document.
        '''
        language_array = [{'lang': lang}
                          for lang in filter_lang.lower().split(',')]
        translation_data = self.translation.find({
            "$and": [{
                "cameo_name": normalized_query
            }, {
                "source": source
            }, {
                "$or": language_array
            }]
        })
        translation_result = list(translation_data)
        for data in translation_result:
            # pop() tolerates documents stored without one of the fields.
            data.pop('cameo_name', None)
            data.pop('creation_timestamp', None)
            data.pop('_id', None)
        return translation_result

    def getInsertObject(self, cameo_name, source, lang, name):
        '''Build one translation document stamped with the current time.'''
        d = {}
        d['cameo_name'] = cameo_name
        d['source'] = source
        d['lang'] = lang
        d['name'] = name
        d['creation_timestamp'] = datetime.now()
        return d
def __init__(self, file_dict, db_config, schema, table, batch_size):
    """Remember the input files and create the MongoManager for ``table``.

    file_dict is stored unchanged as ``self.file_paths``.
    """
    self.file_paths = file_dict
    manager_args = (schema, table, batch_size, db_config)
    self.manager = MongoManager(*manager_args)
class CameoJRCRelationService(object):
    '''
    Loads the CAMEO and JRC data, matches their compare strings by edit
    distance and pushes every match into MongoDB.
    '''

    def __init__(self, db_config, schema, table, batch_size, cameo_table,
                 jrc_table):
        self.manager = MongoManager(schema, table, batch_size, db_config)
        self.cameo_manager = MongoManager(schema, cameo_table, batch_size,
                                          db_config)
        jrc_manager = MongoManager(schema, jrc_table, batch_size, db_config)
        jrc = jrc_manager.get_collection()

        # Bucket every JRC compare string by its first character so that a
        # CAMEO string is only compared against names starting the same way.
        self.jrc_cache = {}
        for jrc_record in jrc.find():
            for name in jrc_record['compare_strings']:
                if not name:
                    # An empty string has no first character to bucket by.
                    continue
                # setdefault avoids building the key list for every name.
                self.jrc_cache.setdefault(name[0], []).append({
                    'name': name,
                    'jrc_id': jrc_record['id'],
                    '_id': jrc_record['_id']
                })

    def process(self):
        '''
        Compare every CAMEO compare string longer than one character with
        the JRC names in its first-character bucket and push each pair whose
        edit distance is below 3.

        Returns whatever ``self.manager.flushBatch()`` returns.
        '''
        cameo = self.cameo_manager.get_collection()
        cameo_data = cameo.find({}, no_cursor_timeout=True)
        counter = 0
        edit_distance_counter = [0, 0, 0, 0, 0, 0]
        try:
            for cameo_record in cameo_data:
                for c in cameo_record['compare_strings']:
                    if len(c) <= 1:
                        continue
                    for j in self.jrc_cache.get(c[0], ()):
                        edit_distance = int(editdistance.eval(c, j['name']))
                        if edit_distance < 3:
                            counter += 1
                            edit_distance_counter[edit_distance] += 1
                            self.manager.pushRecords(
                                self.getInsertObject(
                                    cameo_record['_id'], j['_id'],
                                    cameo_record['cameo_title'],
                                    j['jrc_id'], c, j['name'],
                                    edit_distance,
                                    cameo_record['record_type']))
        finally:
            # A no_cursor_timeout cursor is not reaped automatically, so
            # close it even when the loop fails.
            cameo_data.close()
        print("Total Matches Found :  %s" % (counter,))
        print("Edit Distance counter :  %s" % (edit_distance_counter,))
        return self.manager.flushBatch()

    def getInsertObject(self, cameo_id, jrc_id, cameo_title, jc_code,
                        cameo_string, jrc_string, edit_distance, record_type):
        '''
        Build the relation document for one CAMEO/JRC match.

        jc_code is stored under the key 'jrc_code'; all other arguments are
        stored under their own names.
        '''
        return {
            'cameo_id': cameo_id,
            'edit_distance': edit_distance,
            'jrc_id': jrc_id,
            'cameo_title': cameo_title,
            'jrc_code': jc_code,
            'cameo_string': cameo_string,
            'jrc_string': jrc_string,
            'record_type': record_type,
        }
from flask import Flask from flask import jsonify from flask import request from flask_pymongo import PyMongo from CertManager import CertManager from MongoManager import MongoManager app = Flask(__name__) app.config['MONGO_DBNAME'] = 'fedssh' app.config['MONGO_URI'] = 'mongodb://*****:*****@app.route('/all_servers', methods=['GET']) def get_all_servers(): return jsonify({'servers': mm.get_all_servers()}) @app.route('/generate_cert', methods=['POST']) def generate_cert(): unique_name = request.json['name'] #all_certs = (subprocess.run(['ls', CERTS_LOC])).stdout #star = mongo.db.stars """ output = [] for s in star.find():
# Load the configuration named on the command line.
config_file = args.config
config.read(config_file)
input_file = config.get('JRCNames', 'JRCNames.entityfile')
db_config = {
    'host': config.get('MongoDBConnection', 'db.host'),
    'port': config.get('MongoDBConnection', 'db.port'),
    'username': config.get('MongoDBConnection', 'db.username'),
    'password': config.get('MongoDBConnection', 'db.password')
}
schema = config.get('MongoDBConnection', 'db.schema')
table = config.get('JRCNames', 'db.JRCNames')
batch_size = config.get('MongoDBConnection', 'db.batch_limit')

# Drop the target collection first so the import starts from scratch.
cleaner = MongoManager(schema, table, batch_size, db_config)
cleaner.drop_collection()

# Parse the entity file and report whether process() returned a truthy value.
processor = JRCFileParserService(input_file, db_config, schema, table,
                                 batch_size)
processComplete = processor.process()
if processComplete:
    print "DONE"
else:
    print "NOT FINISHED"
print "Finished @ ", str(datetime.now())
class CAMEOFileParserService(object):
    '''
    This class takes care of reading the input files, parsing the text line
    by line and pushing it into MongoDB.
    '''

    def __init__(self, file_dict, db_config, schema, table, batch_size):
        # file_dict maps a record type (e.g. 'Cameo.Phoenix.agents') to the
        # path of the file holding records of that type.
        self.file_paths = file_dict
        self.manager = MongoManager(schema, table, batch_size, db_config)

    def process(self):
        '''
        Parse every configured file according to its record type and push
        the resulting documents. Returns the result of flushBatch().
        '''
        for k, v in self.file_paths.iteritems():
            print "Reading File ", v
            count_record = 0
            entity_count = 0
            with open(v, 'rb') as fileObj:
                lines = fileObj.readlines()
                # Pieces of the multi-line entity currently being collected.
                entity_list = []
                for row in lines:
                    if row[0] == '#':  # Ignore line if start with comment
                        continue
                    count_record += 1
                    if k == 'Cameo.Phoenix.agents':
                        # One record per line.
                        if row[0] == '!':  # Ignore line if start with !
                            continue
                        data_list = row.rstrip('\n').split(' ')
                        if len(data_list) < 2:  # Ignore lines that are blank
                            continue
                        self.manager.pushRecords(
                            self.getInsertObject(data_list, k))
                        entity_count += 1
                    elif k == 'Cameo.Phoenix.Countries.actors':
                        # A blank line ends the current entity.
                        # NOTE(review): an entity still pending when the file
                        # ends without a trailing blank line does not appear
                        # to be pushed — confirm.
                        if row == '\n':
                            if len(entity_list) > 1:
                                self.manager.pushRecords(
                                    self.getInsertObject(entity_list, k))
                                entity_count += 1
                            entity_list = []
                        record = row.rstrip('\n')
                        data_list = []
                        if len(record) > 1 and record[0] == '\t':
                            data_list.append(record.replace('\t', ''))
                        else:
                            # Lines carrying the CountryInfo marker are
                            # tagged so getInsertObject can recognise actors.
                            data_list = record.replace(' # CountryInfo.txt',
                                                       '#ACTOR#').split(' ')
                        for d in data_list:
                            if len(d) > 0:
                                entity_list.append(d)
                    elif k == 'Cameo.Phoenix.International.actors' or k == 'Cameo.Phoenix.MilNonState.actors':
                        record = row.rstrip('\n')
                        # A line not starting with '+' begins a new entity,
                        # which flushes the one collected so far.
                        # NOTE(review): the last entity of the file is only
                        # collected, never pushed, as far as this loop
                        # shows — confirm.
                        if len(record) > 2 and record[0] != '+':
                            if record[0] == '\t':  # ignore tab lines
                                continue
                            if len(entity_list) > 1:
                                self.manager.pushRecords(
                                    self.getInsertObject(entity_list, k))
                                entity_count += 1
                            entity_list = []
                        data_list = []
                        if len(record) > 1:
                            # Text after '#' is kept as a separate piece
                            # prefixed with '$'.
                            data_list = record.replace(' ', '').replace(
                                '#', '#$').split('#')
                        for d in data_list:
                            if len(d) > 0:
                                entity_list.append(d)
                    # elif k == 'Cameo.Phoenix.MilNonState.actors':
                    #     pass
            print "Lines Processed for ", k, " is : ", count_record
            print "Records created for ", k, " is : ", entity_count
        return self.manager.flushBatch()

    def getInsertObject(self, data_list, type):
        '''
        Build the document for one parsed entity.

        data_list holds the pieces of the entity, title first; it is
        modified in place (the code element at index 1 is removed for agents
        and country actors). type is the record type and selects which
        fields are derived.
        '''
        d = {}
        d['record_type'] = type
        d['cameo_title'] = data_list[0]
        if type == 'Cameo.Phoenix.agents' and len(data_list) > 1:
            # Second piece is the category code wrapped in "[~...]".
            t = data_list[1].replace('[~', '')
            t = t.replace(']', '')
            d['cameo_code_category'] = t
            del data_list[1]
        elif type == 'Cameo.Phoenix.Countries.actors':
            # Setup Country code
            t = data_list[1].replace('[', '')
            t = t.replace(']', '')
            d['cameo_country_code'] = t
            del data_list[1]
            # setup variation of country name
            country_name_variation = []
            for data in data_list:
                if data[0] == '+':
                    data = data[1:]
                    country_name_variation.append(data)
            d['cameo_country_name_variation'] = country_name_variation
            # setup country actors and their position
            country_actors = []
            a = {}
            actor_period = []
            for index, val in enumerate(data_list):
                if val.endswith('#ACTOR#'):
                    # A new actor starts: store the previous one first.
                    if len(a) > 0:
                        a['cameo_actor_period'] = actor_period
                        country_actors.append(a)
                    actor_period = []
                    a = {}
                    a['cameo_actor_name'] = val.replace('#ACTOR#', '')
                if val[0] == '[' and val[-1] == ']':
                    actor_period.append(val)
            # NOTE(review): the actor still held in `a` after the loop is
            # not appended, so the last actor looks dropped — confirm.
            d['cameo_country_actors'] = country_actors
        elif type == 'Cameo.Phoenix.International.actors' or type == 'Cameo.Phoenix.MilNonState.actors':
            # setup variation of actor name
            actor_name_variation = []
            for data in data_list:
                if data[0] == '+':
                    data = data[1:]
                    actor_name_variation.append(data)
            d['cameo_actor_name_variation'] = actor_name_variation
            # Pieces prefixed with '$' are stored as organisations.
            actor_orgs = []
            for data in data_list:
                if len(data) > 2 and data[0] == '$':
                    actor_orgs.append(data[1:])
            d['cameo_actor_orgs'] = actor_orgs
        # This is common for all types
        d['compare_strings'] = []
        for data in data_list:
            if len(data) > 1:
                # Fully bracketed pieces are skipped.
                if data[0] == '[' and data[-1] == ']':
                    continue
                tmp_str = data.replace('#ACTOR#', '').lower().replace(
                    '_', '+').replace(' ', '+').replace('\t', '')
                tmp_str = self.remove_text_paranthesis(tmp_str)
                # Only one of the following applies per string: strip a
                # leading '+', else strip a trailing '+', else skip '$'
                # pieces.
                if len(tmp_str) > 2 and tmp_str[0] == '+':
                    tmp_str = tmp_str[1:]
                elif len(tmp_str) > 2 and tmp_str[-1] == '+':
                    tmp_str = tmp_str[:-1]
                elif len(tmp_str) > 2 and tmp_str[0] == '$':
                    continue
                d['compare_strings'].append(tmp_str)
        return d

    def remove_text_paranthesis(self, sentence):
        '''
        Return sentence without any text enclosed in [...] or (...),
        including the brackets themselves. Nesting is tracked with one depth
        counter per bracket kind; an unmatched closing bracket is kept when
        no bracket is open.
        '''
        ret = ''
        skip1c = 0  # depth of open '['
        skip2c = 0  # depth of open '('
        for i in sentence:
            if i == '[':
                skip1c += 1
            elif i == '(':
                skip2c += 1
            elif i == ']' and skip1c > 0:
                skip1c -= 1
            elif i == ')' and skip2c > 0:
                skip2c -= 1
            elif skip1c == 0 and skip2c == 0:
                ret += i
        return ret