def cleStart():
    ## base
    from json import load as j_loads
    v_lst = ("p2p_packet_info.txt", "packet_info.txt", "profile.txt", "stdout", "syslog",
             "syserr", "usage.txt", "VERSION.txt", "DEV_LOG.log", "mob_count", "*.core")
    szPWD = os_getcwd()
    ## clear files from alog
    with open("clear.list", "r") as fList:
        mList = j_loads(fList)
    for dic1 in mList:
        # goto alog path
        # print dic1["path"]
        os_chdir(dic1["path"])
        # clean files
        fShell("cat /dev/null > PTS")
        fShell("rm -rf log/* cores/*")
        # goto base again
        os_chdir(szPWD)
    ## clean other logs
    with open("start.list", "r") as fList:
        mList = j_loads(fList)
    for dic1 in mList:
        # goto alog path
        # print dic1["path"]
        os_chdir(dic1["path"])
        fShell("echo --- delete inside '%s' ---" % dic1["path"])
        fShell("rm -fv %s" % " ".join(v_lst))
        # goto base again
        os_chdir(szPWD)
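cleStart relies on helpers that are not shown here (os_getcwd, os_chdir, fShell). A minimal sketch of what they are assumed to be, i.e. thin aliases and a wrapper over the standard library; these definitions are guesses, not the project's actual code:

# Assumed helpers (hypothetical reconstruction, not the project's actual definitions).
from os import getcwd as os_getcwd, chdir as os_chdir
from subprocess import call

def fShell(cmd):
    # Run a shell command string and return its exit code.
    return call(cmd, shell=True)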
def __api_request(service_label, params, index_name=None):
    """Wraps the access to the Nordlys API.

    It returns a 3-tuple (results, total no. of results, pretty status message).

    :param service_label: a constant for the required service_label.
    :param params: request params.
    :param index_name: optional; name of index.
    :return: a (results, total no. of results, status message) tuple.
    """
    results = None  # default init; it remains None if the request returns an error
    total = 0
    msg = ""
    url = "/".join([PROTOCOL, SERVER_HOSTNAME_API, service_label])
    if service_label == SERVICE_E_RETRIEVAL:
        url += "?q={}&model={}&start={}&1st_num_docs={}&fields_return={}".format(
            quote(params.get("q", "")),
            params.get("model", "lm"),
            params.get("start", 0),
            params.get("1st_num_docs", 100),
            params.get("fields_return", "abstract"),
        )
        url += "&num_docs={}".format(params.get("num_docs", NUM_RESULTS))
    elif service_label == SERVICE_E_LINKING:
        url += "?q={}".format(quote(params.get("q", "")))
    elif service_label == SERVICE_TTI:
        url += "?q={}&method={}&num_docs={}&start={}&index={}&field={}".format(
            quote(params.get("q", "")),
            params.get("method", "tc"),
            params.get("num_docs", NUM_RESULTS),
            params.get("start", 0),
            params.get("index", TTI_INDEX_FALLBACK_2015_10),
            params.get("field", "_id"))
    try:
        print("Service request URL: {}".format(url))
        r = requests_get(url, timeout=REQUEST_TIMEOUT)
        print(r)
        results = j_loads(r.text)
        total = results.get("total_hits", 0)
        # Obtain postprocessed results to render, if needed
        entity_collection = (MONGO_ENTITY_COLLECTIONS[0]
                             if len(MONGO_ENTITY_COLLECTIONS) > 0
                             else "dbpedia-2015-10")
        results = process_results(results, service_label,
                                  protocol=PROTOCOL,
                                  server_hostname_api=SERVER_HOSTNAME_API,
                                  entity_collection=entity_collection,
                                  request_timeout=REQUEST_TIMEOUT)
    except ConnectionError:
        msg = "We're so sorry. There was a connection error :("
    except Timeout:
        msg = "Timeout while trying to connect to the remote server, or while receiving data from it :("
    except JSONDecodeError:
        msg = "There are no results for your query :("
    return results, total, msg
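A hypothetical call, assuming the SERVICE_* constants and module-level settings referenced above are defined elsewhere; shown only to illustrate the (results, total, msg) contract:

# Illustrative usage (query and parameter values are made up).
results, total, msg = __api_request(SERVICE_E_RETRIEVAL,
                                    {"q": "world cup winners", "start": 0, "num_docs": 10})
if results is None:
    print(msg)    # a user-facing error message
else:
    print(total)  # total number of hits reported by the API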
def _on_full_final_result(self, _, json_str):
    Gdk.threads_enter()
    if self.keywords != "":
        # prev_text = self.phrasebuf.get_text()
        string = "----------------------------\n"
        json_data = j_loads(json_str)
        transcript = json_data["result"]["hypotheses"][0]['transcript']
        for word in self.keywords.split():
            cnt = transcript.count(word)
            substring = word + " (" + str(cnt) + ")\n"
            string += substring
        self.phrasebuf.begin_user_action()
        self.phrasebuf.insert_at_cursor(string)
        self.phrasebuf.end_user_action()
    Gdk.threads_leave()
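For illustration, this is what the counting loop above produces for a made-up transcript; note that str.count() matches substrings, so a keyword is also counted inside longer words:

# Standalone sketch of the loop above (sample data only).
transcript = "the cat sat near the category sign"
keywords = "cat sign"
block = "----------------------------\n"
for word in keywords.split():
    block += "{} ({})\n".format(word, transcript.count(word))
# block == "----------------------------\ncat (2)\nsign (1)\n"
# "cat" is counted twice because it also occurs inside "category".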
def _handle_full_final_result(self, _, json_str):
    keyword = self.keyword
    json_data = j_loads(json_str)
    result = []
    kw_list = keyword.split(" ")  # the keyword phrase split into words
    word_alignment = json_data["result"]["hypotheses"][0]["word-alignment"]
    # Shift each word's timestamp by the segment start time
    for i in word_alignment:
        i["start"] += json_data["segment-start"]
    # Search for any occurrences of the keyword phrase
    for i, j in enumerate(word_alignment):
        # When we come across the first word of kw_list
        if j["word"] == kw_list[0]:
            kw_list_length = len(kw_list)
            window = word_alignment[i: i + kw_list_length]
            # If the phrase is a single word, the window is already a match
            flag = True
            if kw_list_length != 1:
                # Otherwise check whether all sequential elements in the window
                # match the words of the phrase
                idx = 0
                for item in kw_list:
                    if idx == len(window):
                        flag = False
                        break
                    if item == window[idx]["word"] and idx != 0:
                        pass
                    elif idx != 0 and item != window[idx]["word"]:
                        flag = False
                    idx += 1
            if flag is True:
                print(window)
                result.append(window[0]["start"])
    self.timestamps.extend(result)
    self._display_found_timestamps(result)
    self.text_widget.config(state=NORMAL)
    self.text_widget.insert(END, "{0} ".format(json_data["result"]["hypotheses"][0]["transcript"]))
    self.text_widget.highlight_pattern(r"\b{0}\b".format(self.keyword), "red", len(self.keyword))
    self.text_widget.see(END)
    self.text_widget.config(state=DISABLED)
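Both result handlers above assume a decoder payload of roughly the following shape. The example is fabricated; only the keys the code actually reads (result.hypotheses[0].transcript, word-alignment, segment-start) are meaningful, any other fields are placeholders:

# Fabricated example of the json_str payload the handlers parse.
example_json_str = """{
  "segment-start": 12.3,
  "result": {
    "hypotheses": [{
      "transcript": "hello world",
      "word-alignment": [
        {"word": "hello", "start": 0.10},
        {"word": "world", "start": 0.55}
      ]
    }]
  }
}"""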
def staInit():
    global proclist
    ## base
    from json import load as j_loads
    with open("start.list", "r") as fList:
        mList = j_loads(fList)
    proclist.clear()
    for dic1 in mList:
        keyCheck(proclist, dic1["serv"])
        if dic1["type"] == M2TYPE.DB:
            keyCheck(proclist[dic1["serv"]], "db", [])
            proclist[dic1["serv"]]["db"].append(dic1)
        elif dic1["type"] == M2TYPE.AUTH or dic1["type"] == M2TYPE.CORE:
            keyCheck(proclist[dic1["serv"]], "core", [])
            proclist[dic1["serv"]]["core"].append(dic1)
            if dic1["type"] == M2TYPE.CORE:
                keyCheck(proclist[dic1["serv"]], "chan", set())
                proclist[dic1["serv"]]["chan"].add(dic1["chan"])
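keyCheck and M2TYPE are defined elsewhere in the project; a plausible minimal sketch, assuming keyCheck behaves like dict.setdefault and M2TYPE is a simple constants holder:

# Hypothetical stand-ins for the helpers used above (not the project's actual code).
class M2TYPE:
    DB, AUTH, CORE = range(3)

def keyCheck(dic, key, default=None):
    # Ensure dic[key] exists, initializing it with `default` (a dict if omitted).
    if key not in dic:
        dic[key] = {} if default is None else default
    return dic[key]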
def __api_request(service_label, params, index_name=None):
    """Wraps the access to the Nordlys API.

    It returns a 3-tuple (results, total no. of results, pretty status message).

    :param service_label: a constant for the required service_label.
    :param params: request params.
    :param index_name: optional; name of index.
    :return: a (results, total no. of results, status message) tuple.
    """
    results = None  # default init; it remains None if the request returns an error
    total = 0
    msg = ""
    url = ""
    if service_label == SERVICE_E_RETRIEVAL:
        url = "/".join([PROTOCOL, SERVER_HOSTNAME_API, "er", index_name])
        url += "?q={}&model={}&start={}&1st_num_docs={}&fields_return={}".format(
            quote(params.get("q", "")),
            params.get("model", "lm"),
            params.get("start", 0),
            params.get("1st_num_docs", 100),
            params.get("fields_return", "abstract"),
        )
        url += "&num_docs={}".format(params.get("num_docs", NUM_RESULTS))
    elif service_label == SERVICE_E_LINKING:
        url = "/".join([PROTOCOL, SERVER_HOSTNAME_API, "el"])
        url += "?q={}".format(quote(params.get("q", "")))
    elif service_label == SERVICE_TTI:
        # TODO ideal with API
        url = "/".join([PROTOCOL, SERVER_HOSTNAME_API, "types"])
        url += "?q={}&method={}&num_docs={}&start={}&index={}&field={}".format(
            quote(params.get("q", "")),
            params.get("method", "tc"),
            params.get("num_docs", NUM_RESULTS),
            params.get("start", 0),
            params.get("index", TTI_INDEX_FALLBACK_2015_10),
            params.get("field", "_id"))
        # TODO working on local after tunneling Elastic
        # url = "/".join([PROTOCOL, "localhost:8080", index_name, "_search?q={}&fields=_id".format(
        #     quote(params.get("q", "")))])
        # TODO working on gustav1 directly to Elastic
        # url = "/".join([PROTOCOL, SERVER_HOSTNAME_ELASTIC, index_name, "_search?q={}&fields=_id".format(
        #     quote(params.get("q", "")))])
    try:
        print(url)
        r = requests_get(url, timeout=REQUEST_TIMEOUT)  # TODO AJAXify this?
        results = j_loads(r.text)
        # total = WWW_PAGINATION_MAX_RESULTS_ER
        total = len(results.get("results", {}))
        # Obtain postprocessed results to render, if needed
        results = process_results(results, service_label)
    except ConnectionError:
        msg = "We're so sorry. There was a connection error :("
    except Timeout:
        msg = "Timeout while trying to connect to the remote server, or while receiving data from it :("
    return results, total, msg
src = soup.select('script[src*="ProfilePageContainer.js/"]')[0]['src']
queryHashPhoto = get_query_hash_for_get_new_photo(src)
src = soup.select('script[src*="/Consumer.js/"]')[0]['src']
queryHash = get_query_hash(src)
# queryHash = '477b65a610463740ccdb83135b2014db'

# create threads and log
for i in range(20):
    allThread.append(Parser())
    allThread[i].start()
Log().start()

# parse
script = soup.find('body').find('script')
shareData = str(script)[52:-10]
shareData = j_loads(shareData)
userId = shareData['entry_data']['ProfilePage'][0]['logging_page_id'].split('_')[-1]
nowJpg = shareData['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']
lastJpg = shareData['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']

while True:
    for infoJpg in nowJpg:
        infoJpg = infoJpg['node']
        allJpg.put(infoJpg)
    if lastJpg is None:
        break
    nowJpg, lastJpg = get_new_data(queryHashPhoto, userId, lastJpg)
    Log.end_cursor = lastJpg

WORK = False
for th in allThread:
    th.join()
Log.work = False
end = time.time()
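Parser, Log, allJpg, and the WORK flag come from the surrounding script and are not shown; a rough sketch of the producer/consumer pattern this code implies, with the worker draining the shared queue. This is hypothetical, not the original classes:

# Hypothetical sketch of the worker pattern implied above.
import threading, queue

allJpg = queue.Queue()
WORK = True

class Parser(threading.Thread):
    def run(self):
        # Consume photo metadata from the shared queue until the main loop
        # clears WORK and the queue is empty.
        while WORK or not allJpg.empty():
            try:
                info = allJpg.get(timeout=1)
            except queue.Empty:
                continue
            # ...download / process `info` here...
            allJpg.task_done()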
for h_id in hotel_ids[:100]:
    if str(h_id) in parsed_hotels:
        continue
    reviews = []
    reviews_raw = []
    start = 0
    items = 10
    while True:
        # Assuming r_get is requests.get, headers must be passed by keyword,
        # otherwise they would be sent as query params.
        temp = r_get(source.format(h_id, start, items), headers=headers)
        if temp.status_code != 200:
            with codecs.open('errors_hotels.txt', 'a+', 'utf-8') as f:
                f.write(str(h_id) + str(start) + '\n')
            break
        temp = j_loads(temp.content[4:-2])
        if not temp[u'reviewDetails'][u'numberOfReviewsInThisPage']:
            break
        reviews += temp[u'reviewDetails'][u'reviewCollection']['review']
        reviews_raw.append(temp)
        start += 10
        sleep(1)
    if len(reviews) > 0:
        reviews_df = pd.DataFrame(reviews)[cols_to_remain]
        reviews_df.to_csv('reviews.csv', mode='a+', header=False, index=False, encoding='utf-8')
        with codecs.open('reviews_rav.txt', 'a+', 'utf-8') as f:
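The temp.content[4:-2] slice suggests the endpoint wraps its JSON payload in a short non-JSON prefix and suffix; a small helper that makes that intent explicit, with the exact wrapper lengths taken from the slice above and otherwise assumed:

# Assumption: the response body looks like <4-char prefix>{...json...}<2-char suffix>,
# which is what the [4:-2] slice above strips before decoding.
from json import loads as j_loads

def strip_wrapper(raw_bytes, prefix_len=4, suffix_len=2):
    # Return the decoded JSON payload embedded in a wrapped response body.
    return j_loads(raw_bytes[prefix_len:-suffix_len])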
def process_results(raw_results, service_label, protocol="http:/", server_hostname_api="",
                    entity_collection="", request_timeout=30):
    """Processes raw results to obtain further data components.

    :param raw_results: the decoded API response (dict).
    :param service_label: a constant for the required service_label.
    :return: a list of result dicts ready for rendering.
    """
    # note: protocol defaults to "http:/"; the "/".join below supplies the second slash
    results = []
    if service_label is SERVICE_E_RETRIEVAL:
        sorted_ranks = __sort_set_of_str_elems(raw_results.get("results", {}).keys())
        for rank in sorted_ranks:
            result_dict = raw_results.get("results", {}).get(rank, {})
            entity_id = result_dict.get("entity", "")
            if entity_id == "":
                continue
            unprefixed_doc_id = entity_id.split(":")[-1].split(">")[0]
            abstract_list = result_dict.get("fields", {}).get("abstract", [])
            abstract = abstract_list[0] if len(abstract_list) > 0 else ""
            # Entity catalog request for making entity cards and getting Freebase ID
            url = "/".join([protocol, server_hostname_api, "ec", entity_id])
            try:
                # print("\tCatalog request URL: {}".format(url))
                r = requests_get(url, timeout=request_timeout)
                catalog_results = j_loads(r.text)
            except Exception:
                catalog_results = dict()
            card_data = __obtain_card_data(catalog_results)
            # Final result dict
            result = {
                RESULT_DOC_TITLE_K: unprefixed_doc_id.replace("_", " "),
                RESULT_DOC_ID_K: entity_id,
                RESULT_DOC_SNIPPET_K: __shorten_abstract(abstract),
                RESULT_URL_DBPEDIA_K: "/".join([DBPEDIA_HOSTNAME,
                                                SERVICE_TO_DBPEDIA_SUBHOST[service_label],
                                                unprefixed_doc_id]),
                RESULT_URL_WIKIPEDIA_K: "/".join([WIKIPEDIA_HOSTNAME,
                                                  SERVICE_TO_WIKIPEDIA_SUBHOST[service_label],
                                                  unprefixed_doc_id]),
                RESULT_FREEBASE_ID_K: catalog_results.get("fb:<owl:sameAs>", [None])[0],
                RESULT_DOC_CARD_K: card_data
            }
            results.append(result)

    elif service_label is SERVICE_E_LINKING:
        query = raw_results.get("processed_query", "")
        linked_results = raw_results.get("results", {})
        result_counter = 0
        for result_l in sorted(linked_results, key=lambda k: k['score'], reverse=True):
            result_counter += 1
            entity_id = result_l['entity']
            score = result_l['score']
            unprefixed_doc_id = entity_id.split(":")[-1].split(">")[0]
            entity_url = "/".join([DBPEDIA_HOSTNAME,
                                   SERVICE_TO_DBPEDIA_SUBHOST[service_label],
                                   unprefixed_doc_id])
            # Entity catalog request for getting popup picture and abstract
            url = "/".join([protocol, server_hostname_api, "ec", entity_id])
            try:
                # print("\tCatalog request URL: {}".format(url))
                r = requests_get(url, timeout=request_timeout)
                catalog_results = j_loads(r.text)
            except Exception:
                catalog_results = dict()
            # Defining result components
            picture = __get_card_picture(catalog_results)  # possibly None
            most_specific_type = __get_card_type(catalog_results)  # possibly None
            if most_specific_type:
                most_specific_type = most_specific_type.upper()
            abstract = __shorten_abstract(catalog_results.get("<dbo:abstract>", [""])[0],
                                          max_length=400)
            formatted_result = query.replace(
                unprefixed_doc_id.lower(),
                "<a href=\"{}\" target=\"_blank\" id=\"elLink{}\" "
                "onmouseover=\"showPop('elPop{}', event);\""
                " onmouseout=\"hidePop('elPop{}');\""  # NOTE: the blank between the event handlers is important
                ">"
                "{}</a>".format(entity_url, result_counter, result_counter, result_counter,
                                unprefixed_doc_id.lower()))
            result = {
                # RESULT_LINKED_SUBSTR_K: linked_substr,
                RESULT_DOC_TITLE_K: unprefixed_doc_id.replace("_", " "),
                RESULT_DOC_ID_K: entity_id[1:-1] if len(entity_id) > 1 else "",
                RESULT_DOC_SNIPPET_K: abstract,
                RESULT_DOC_PICTURE_K: picture,
                RESULT_URL_DBPEDIA_K: entity_url,
                RESULT_DOC_SCORE_K: round(score, 6),
                RESULT_DOC_TYPE_K: most_specific_type,
                RESULT_EL_TO_SHOW_K: formatted_result
            }
            results.append(result)

    elif service_label is SERVICE_TTI:
        # TODO remap to exp the TTI scores from API when method is LM
        sorted_ranks = __sort_set_of_str_elems(raw_results.get("results", {}).keys())
        for rank in sorted_ranks[0:10]:
            result_dict = raw_results.get("results", {}).get(rank, {})
            type_id = result_dict.get("type", "")
            if __must_be_skipped(type_id):
                continue
            unprefixed_doc_id = type_id.split(":")[-1].split(">")[0]
            result = {
                RESULT_DOC_TITLE_K: __convert_from_camelcase(unprefixed_doc_id).replace("_", " "),
                RESULT_DOC_ID_K: type_id,
                RESULT_DOC_SCORE_K: round(result_dict.get("score", 0), 6),
                RESULT_URL_DBPEDIA_K: "/".join([DBPEDIA_HOSTNAME,
                                                SERVICE_TO_DBPEDIA_SUBHOST[service_label],
                                                unprefixed_doc_id])
            }
            results.append(result)

    return results
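__sort_set_of_str_elems is defined elsewhere; judging by its use on the string rank keys of the results dict, something like the following numeric sort is assumed (hypothetical helper, not the project's actual implementation):

# Hypothetical helper: sort string rank keys ("0", "1", "10", ...) in numeric order.
def __sort_set_of_str_elems(elems):
    return sorted(elems, key=int)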