def queryRelatedWos(WOS, start_time):
    """Collect the related-record elements for a sequence of WoS ids.

    Opens one Web of Science session and, for each id in ``WOS``, asks for
    up to 8 related records. Every ``REC`` element found in a response is
    added to the result list. A failed query is reported, followed by a
    5 second pause, and the id is skipped (it is not retried).

    Args:
        WOS: iterable of record ids passed to
            ``wos.utils.get_related_records``.
        start_time: ``time.time()`` timestamp used as the reference point
            for the elapsed-minutes progress message.

    Returns:
        list: all ``REC`` elements gathered from the successful queries,
        in query order.
    """
    root_list = []
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting to look for related records")
        for i, wos_id in enumerate(WOS):
            try:
                records = wos.utils.get_related_records(
                    client, wos_id, count=8)
                root_list.extend(records.findall("REC"))
            except Exception:
                # Only ordinary errors are treated as recoverable; a bare
                # ``except`` would also swallow KeyboardInterrupt/SystemExit
                # and make this long-running loop impossible to stop.
                wait = 5
                print("Some error while adding roots. Waiting {} sec".format(
                    wait))
                time.sleep(wait)
            # Progress report every 50 queries, elapsed time in minutes.
            if (i + 1) % 50 == 0:
                print("Queries so far is {}. Time used is {:0.1f}".format(
                    (i + 1), ((time.time() - start_time) / 60.0)))
            # Short pause between consecutive queries.
            time.sleep(0.5)
    return root_list
def queryRelatedWos(WOS, start_time):
    """Fetch related records from Web of Science for each id in ``WOS``.

    A single client session is used for all queries. Each query requests up
    to 8 related records, and every ``REC`` element of the response is
    appended to the returned list. If a query raises, a message is printed,
    the function sleeps 5 seconds and moves on to the next id without
    retrying.

    Args:
        WOS: iterable of record ids to query.
        start_time: ``time.time()`` value taken by the caller; used only to
            print the elapsed time (in minutes) every 50 queries.

    Returns:
        list: the ``REC`` elements collected from all successful queries.
    """
    root_list = []
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting to look for related records")
        for i, wos_id in enumerate(WOS):
            try:
                records = wos.utils.get_related_records(
                    client, wos_id, count=8)
                root_list.extend(records.findall("REC"))
            except Exception:
                # Narrower than a bare ``except`` so that Ctrl-C and
                # interpreter shutdown are not mistaken for a query failure.
                wait = 5
                print("Some error while adding roots. Waiting {} sec".format(
                    wait))
                time.sleep(wait)
            # Report progress on every 50th query.
            if (i + 1) % 50 == 0:
                print("Queries so far is {}. Time used is {:0.1f}".format(
                    (i + 1), ((time.time() - start_time) / 60.0)))
            # Short pause between consecutive queries.
            time.sleep(0.5)
    return root_list
def queryWos(tcp_data, start, start_sample, end_sample):
    """Look up records of ``tcp_data`` in Web of Science by title and year.

    For each index value in ``tcp_data.index[start_sample:end_sample]`` the
    title is sanitised and a query of the form
    ``TI=<title> AND PY=(<year-1> OR <year> OR <year+1>)`` is sent with
    ``count=1``. The returned record is accepted only if its title is more
    than 95% similar (``difflib.SequenceMatcher`` ratio) to the local title
    and it has an abstract.

    Args:
        tcp_data: table indexed by record id; ``tcp_data.loc[id]`` must
            expose ``Title`` (string) and ``Year`` (number) attributes.
        start: ``time.time()`` timestamp used for the elapsed-minutes
            progress message.
        start_sample: start of the slice of index values to process.
        end_sample: end (exclusive) of the slice of index values to process.

    Returns:
        tuple: ``(info, not_found)`` where ``info`` is a list of
        ``(root, id)`` tuples for accepted records and ``not_found`` is a
        list of ids whose query failed or whose titles did not match.
        Records with a matching title but no abstract appear in neither
        list.
    """
    info = []
    not_found = []

    sample_ids = tcp_data.index.values.tolist()[start_sample:end_sample]

    # Connect to Web of Science
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting the queries")

        for i, record_id in enumerate(sample_ids):
            # Strip or replace characters from the title before it is
            # placed in the query string.
            title = (
                tcp_data.loc[record_id]
                .Title.replace("|", ",")
                .replace("?", "")
                .replace('"', "")
                .replace("/", " ")
                .replace("-", " ")
                .replace(":", " ")
            )
            # Drop parenthesised fragments, then any unbalanced parentheses.
            title = re.sub(r"\([^)]*\)", "", title)
            title = title.replace("(", "").replace(")", "")

            # Year filter with a tolerance of +/- 1 year.
            year = tcp_data.loc[record_id].Year
            query_string_year = (
                "PY=(" + str(year - 1) + " OR " + str(year) + " OR "
                + str(year + 1) + ")"
            )

            # Operator words (like AND, OR, NOT) in the title are removed.
            if isOperator(title):
                title = removeOperator(title)

            query_string = "TI=" + title + " AND " + query_string_year

            # Perform the query on the WoS engine; on failure ``root`` stays
            # None and the record is counted as not found.
            root = None
            wait = 2
            try:
                root = wos.utils.query_v2(client, query_string, count=1)
            except suds.WebFault as fault:
                print("Suds.WebFault: Waiting {} sec".format(wait))
                # Print the arguments of the fault that was raised (the
                # class attribute ``suds.WebFault.args`` carries no
                # information about this particular failure).
                print(fault.args)
                time.sleep(wait)
            except Exception:
                # Not a bare ``except`` so KeyboardInterrupt/SystemExit
                # still stop the run.
                print("Some other error occured, sleep {} sec".format(wait))
                time.sleep(wait)

            if root is None:
                not_found.append(record_id)
                print("Did not find record with title {}".format(title))
                print("Not found length is {}".format(len(not_found)))
            else:
                # Normalise both titles the same way before comparing.
                tcp_data_title = tcp_data.loc[record_id].Title.replace(
                    "|", ",")
                tcp_data_title = re.sub(r"\([^)]*\)", "", tcp_data_title)
                tcp_data_title = removePunct(tcp_data_title)

                wos_title = getTitle(root)
                wos_title = re.sub(r"\([^)]*\)", "", wos_title)
                wos_title = removePunct(wos_title)

                print("tcp title: {}".format(tcp_data_title))
                print("wos title: {}".format(wos_title))
                # print("") emits a blank line on both Python 2 and 3.
                print("")

                similarity = difflib.SequenceMatcher(
                    None, tcp_data_title, wos_title).ratio()
                if similarity > 0.95:
                    if getAbstract(root) is not None:
                        info.append((root, record_id))
                        print("Successfully retrieved is now {}".format(
                            len(info)))
                    else:
                        print("")
                        print("Abstract was none")
                        print("")
                else:
                    print("titles not alike...")
                    not_found.append(record_id)

            print(
                "Number of queries so far is {}. "
                "Time used is {:0.1f} minutes".format(
                    (i + 1), ((time.time() - start) / 60.0)
                )
            )
            # Short pause between consecutive queries.
            time.sleep(0.5)

    return info, not_found
def queryWos(tcp_data, start, start_sample, end_sample):
    """Query Web of Science for a slice of ``tcp_data`` by title and year.

    Each index value in ``tcp_data.index[start_sample:end_sample]`` yields
    one query, ``TI=<sanitised title> AND PY=(<year-1> OR <year> OR
    <year+1>)``, limited to a single result. A result is kept when the
    ``difflib.SequenceMatcher`` ratio between the normalised local and WoS
    titles exceeds 0.95 and the record has an abstract.

    Args:
        tcp_data: table indexed by record id; ``tcp_data.loc[id]`` must
            provide ``Title`` (string) and ``Year`` (number).
        start: ``time.time()`` timestamp used to report elapsed minutes.
        start_sample: first position of the index slice to process.
        end_sample: position one past the last index value to process.

    Returns:
        tuple: ``(info, not_found)``. ``info`` holds ``(root, id)`` tuples
        for kept records; ``not_found`` holds ids whose query failed or
        whose titles were not alike. A record whose title matches but whose
        abstract is missing is added to neither list.
    """
    info = []
    not_found = []

    sample_ids = tcp_data.index.values.tolist()[start_sample:end_sample]

    # Connect to Web of Science
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting the queries")

        for i, record_id in enumerate(sample_ids):
            # Sanitise the title before it is embedded in the query string.
            title = (
                tcp_data.loc[record_id]
                .Title.replace("|", ",")
                .replace("?", "")
                .replace('"', '')
                .replace("/", " ")
                .replace("-", " ")
                .replace(":", " ")
            )
            # Remove parenthesised fragments and leftover parentheses.
            title = re.sub(r'\([^)]*\)', '', title)
            title = title.replace("(", "").replace(")", "")

            # Publication year, searched with +/- 1 year of slack.
            year = tcp_data.loc[record_id].Year
            query_string_year = (
                'PY=(' + str(year - 1) + ' OR ' + str(year) + ' OR '
                + str(year + 1) + ')'
            )

            # Operator words (like AND, OR, NOT) in the title are removed.
            if isOperator(title):
                title = removeOperator(title)

            query_string = 'TI=' + title + ' AND ' + query_string_year

            # Run the query; ``root`` stays None if anything goes wrong so
            # the record is recorded as not found below.
            root = None
            wait = 2
            try:
                root = wos.utils.query_v2(client, query_string, count=1)
            except suds.WebFault as fault:
                print("Suds.WebFault: Waiting {} sec".format(wait))
                # Report the raised fault's own arguments rather than the
                # ``args`` attribute of the exception class.
                print(fault.args)
                time.sleep(wait)
            except Exception:
                # Deliberately not a bare ``except``: Ctrl-C and interpreter
                # exit must still propagate.
                print("Some other error occured, sleep {} sec".format(wait))
                time.sleep(wait)

            if root is None:
                not_found.append(record_id)
                print("Did not find record with title {}".format(title))
                print("Not found length is {}".format(len(not_found)))
            else:
                # Apply the same normalisation to both titles.
                tcp_data_title = tcp_data.loc[record_id].Title.replace(
                    "|", ",")
                tcp_data_title = re.sub(r'\([^)]*\)', '', tcp_data_title)
                tcp_data_title = removePunct(tcp_data_title)

                wos_title = getTitle(root)
                wos_title = re.sub(r'\([^)]*\)', '', wos_title)
                wos_title = removePunct(wos_title)

                print("tcp title: {}".format(tcp_data_title))
                print("wos title: {}".format(wos_title))
                # print("") gives a blank line under Python 2 and Python 3.
                print("")

                similarity = difflib.SequenceMatcher(
                    None, tcp_data_title, wos_title).ratio()
                if similarity > 0.95:
                    if getAbstract(root) is not None:
                        info.append((root, record_id))
                        print("Successfully retrieved is now {}".format(
                            len(info)))
                    else:
                        print("")
                        print("Abstract was none")
                        print("")
                else:
                    print("titles not alike...")
                    not_found.append(record_id)

            print(
                "Number of queries so far is {}. "
                "Time used is {:0.1f} minutes".format(
                    (i + 1), ((time.time() - start) / 60.0)
                )
            )
            # Short pause between consecutive queries.
            time.sleep(0.5)

    return info, not_found