def gtv(path_taken, display=False):
    # print(path_taken)
    # Slice the path into Page objects.
    s = time.time()
    if display:
        log(f"path_taken: '{path_taken}'")
    path = [Page(d, display=display) for d in path_taken.split('>')]
    source = path[0]  # first element of the path
    dest = path[-1]   # last element of the path
    # ----------------------------------------------------------------------
    # Cosine similarity between the TF-IDF vectors of the endpoint contents.
    documents = [source.pagecontent, dest.pagecontent]
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')

    def cos_similarity(textlist):
        tfidf = TfidfVec.fit_transform(textlist)
        return (tfidf * tfidf.T).toarray()

    cos_sim = cos_similarity(documents)
    # ----------------------------------------------------------------------
    cos_sim = cos_sim[0][1]
    truthvalue = 1.00
    for x in path[1:]:
        print(f"{x.backlinkcount}*{cos_sim}")
        truthvalue += math.log(x.backlinkcount * cos_sim)
    # ----------------------------------------------------------------------
    # Blend the inverted log-backlink score with the endpoint cosine
    # similarity, each weighted 0.5.
    truthvalue = 1 / truthvalue
    truthvalue *= 0.5
    truthvalue += cos_sim * 0.5
    return truthvalue
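# A standalone sketch of the TF-IDF cosine-similarity step used in gtv(), with
# the LemNormalize tokenizer left out so it runs on its own. The helper name
# and the two documents are fabricated examples, not part of the project.
def _demo_cosine_similarity():
    from sklearn.feature_extraction.text import TfidfVectorizer
    documents = [
        "Barack Obama was born in Honolulu, Hawaii.",
        "Hawaii is a state of the United States; Honolulu is its capital.",
    ]
    vec = TfidfVectorizer(stop_words='english')
    tfidf = vec.fit_transform(documents)  # rows are L2-normalised TF-IDF vectors
    sim = (tfidf * tfidf.T).toarray()     # pairwise cosine similarities
    return sim[0][1]                      # similarity of the two documents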
def cfc_get_truth_value(x, y, token):
    s = time.time()
    path = [
        x.replace('_', ' ')
        for x in retrieve_path(x.strip(), y.strip(), token)
    ]
    if len(path) == 0:
        errorlog(
            "ERROR: Could not retrieve path. Check page title spelling and make sure token has not expired."
        )
        path = "undefined"
    else:
        path = stringify_path(path)
        log("Found path {}. Took {:.2f}s.".format(path, time.time() - s))
    score = get_truth_value(path, display=False)
    return score
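# A hypothetical call sketch for cfc_get_truth_value(); the titles below are
# placeholders, and a reachable wikipaths service plus a valid token are
# assumed.
#
#   score = cfc_get_truth_value("Barack Obama", "Hawaii", token)
#   print(score)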
def __init__(self, title=None, pageid=None, redirect=True, preload=False,
             original_title='', backlinksonly=False):
    start_time = time.time()
    log("Truth.page: Creating page object for {}".format(title))
    # Try to get the data from the database first.
    # If the record does not exist, the value will be None;
    # in that case, run the WikipediaPage constructor and save the data.
    page_db_rows = dh.get_page(title)
    if not page_db_rows:  # the page does not exist in the database
        log("Truth.page: No existing data found for {} in the database. Retrieving from API.".format(title))
        # Load a WikipediaPage object and assign its attributes to self.
        d = page(title=title, pageid=pageid, auto_suggest=True, redirect=True, preload=False)
        self.title = title
        self.pageid = d.pageid
        self.backlinkcount = self.api_retrievebacklinkcount()  # retrieve the backlink count via Dispenser's API
        if not backlinksonly:  # the caller wants the full content, not just backlinks
            self.pagecontent = d.content  # retrieve from the API (lazy loading)
        else:
            self.pagecontent = None  # do not retrieve the content
        self.remember()  # save the data to the database
    else:  # the data exists in the database
        log("Truth.page: Data for {} found in database. Retrieving from database.".format(title))
        self.title = page_db_rows[0][0]
        if not backlinksonly:  # the caller wants the full content
            if not page_db_rows[0][1]:
                # The page is present but its content is NULL
                # (e.g. it was previously loaded with backlinksonly=True).
                log("Retrieving Content (Page is in DB, but content is not loaded).")
                d = page(title=title, pageid=pageid, auto_suggest=True, redirect=True, preload=False)
                self.pageid = d.pageid
                self.pagecontent = d.content
                self.update()  # remember the downloaded content in the database
            else:
                # The content is present, so simply assign it.
                self.pagecontent = page_db_rows[0][1]
        else:
            self.pagecontent = None  # backlinks only: leave the content unset
        self.backlinkcount = page_db_rows[0][2]  # load the backlink count from the database
    log("Truth.page: Successfully created " + repr(self) + ", finished {:.5f}s".format(time.time() - start_time))
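# A minimal usage sketch of the caching behaviour above; the title is
# illustrative, and the dh database handler is assumed to be configured.
#
#   p1 = Page("Hawaii")                      # first call: fetched from the API, then remembered
#   p2 = Page("Hawaii", backlinksonly=True)  # later call: served from the database
#   print(p2.backlinkcount)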
def __init__(self, source, dest, pathonly=False):
    page_db_rows = dh.get_path(source, dest)
    self.nodes = []
    if not page_db_rows:  # the path is not in the database, so query the API
        log(f"Querying '{source}' -> '{dest}' from API.")
        self.titles = [
            x.replace('_', ' ')
            for x in get_path(source.strip(), dest.strip(), TOKEN)
        ]
        # Catch the possible errors.
        if len(self.titles) == 1:  # Error no. 1: the page does not exist.
            errorlog(self.titles[0])
            raise Exception(self.titles[0])
        elif len(self.titles) == 0:  # Error no. 2: the access token is invalid.
            errorlog("invalid access token")
            raise Exception("Invalid Access token")
        self.pathstring = ">".join(self.titles)
        log(f"Found nodes {self.titles}")
        self.source = self.titles[0]
        self.dest = self.titles[-1]
        self.remember()
    else:
        log("Truth.Path: Path Found in database. Loading from DB.")
        self.source = page_db_rows[0][0]
        self.dest = page_db_rows[0][1]
        self.pathstring = page_db_rows[0][2]
        self.cosine_similarity = page_db_rows[0][3]
        self.titles = self.pathstring.split('>')
    if not pathonly:
        # Load full content for the endpoints only; intermediate nodes just
        # need their backlink counts.
        for idx, x in enumerate(self.titles):
            if idx == 0 or idx == len(self.titles) - 1:
                self.nodes.append(Page(x, backlinksonly=False))
            else:
                self.nodes.append(Page(x, backlinksonly=True))
    log(repr(self))
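# A minimal sketch of how a Path might be used; the titles are illustrative,
# and a valid wikipaths TOKEN plus the configured database layer are assumed.
#
#   import Truth
#   Truth.TOKEN = "<ACCESS_TOKEN>"
#   p = Path("Barack Obama", "Hawaii")
#   print(p.pathstring)  # e.g. "Barack Obama>Hawaii" if a direct path exists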
def stringify_path(lst):
    return ">".join(lst)


x_entities = []
y_entities = []
output_file = "xy.txt"
is_ideologies = False
token = ""
error_threshold = None

if __name__ == '__main__':
    # Read the parameters.
    log("PathFinder.py started.")
    parser = argparse.ArgumentParser(
        description='Find paths taken between articles in Wikipedia.')
    parser.add_argument(
        'x',
        help='Path to file containing x-axis entities, separated by newlines.')
    parser.add_argument(
        'y',
        help='Path to file containing y-axis entities, separated by newlines.')
    parser.add_argument('output_file', help='Where to output the CSV file.')
    parser.add_argument(
        'token',
        help='Access token to be used when querying wikipaths.')
    parser.add_argument('-e', '--error-threshold',
                        dest='error_threshold', default=5)
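# A hypothetical invocation sketch for PathFinder.py; the file names and token
# below are placeholders, not real values:
#
#   python PathFinder.py x_entities.txt y_entities.txt paths.csv <ACCESS_TOKEN> -e 5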
import Truth
from Truth import Path
from scrapers.request import get_path
from scrapers.logger import log
import argparse

if __name__ == '__main__':
    # Read the parameters.
    log("PathEvaluator.py started.")
    parser = argparse.ArgumentParser(
        description='All in one package (pathfinding + truthvalue)')
    parser.add_argument('x', help='Source Node')
    parser.add_argument('y', help='End Node')
    parser.add_argument('token', help='Access token to be used when querying wikipaths.')
    args = vars(parser.parse_args())

    Truth.TOKEN = args['token']
    b = Path(args['x'], args['y'])
    print(b.get_truthvalue())
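# A hypothetical invocation sketch for PathEvaluator.py; the article titles
# and token are placeholders:
#
#   python PathEvaluator.py "Barack Obama" "Hawaii" <ACCESS_TOKEN>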
"""GREC PATHFINDER This will take in a csv, and output a list of paths. """ import pandas as pd from scrapers.logger import log import csv from PathFinder import retrieve_path, stringify_path import time import argparse if __name__ == "__main__": log("Starting grec_pathfinder.py") parser = argparse.ArgumentParser( description= 'Find paths taken between articles in wikipedia using data outputted by grec_pathfinder.py. Good luck.' ) parser.add_argument('output_tag', help='tag you used in grec_extractor.py') parser.add_argument( 'token', help="Access token to be used when querying wikipaths.") args = vars(parser.parse_args()) token = args['token'] filename = "gt_with_paths.csv" outputfile = f"data/GREC/{args['output_tag']}/{filename}" csvfile = open(outputfile, 'w', encoding='utf-8', newline="") gt = f"data/GREC/{args['output_tag']}/ground_truth.csv" a = pd.read_csv(gt, sep=';')
""" Authors: Ibalio, Jan Leryc Alfafara, Lean This is the backend for the C# fact checker program. """ import sys from scrapers.logger import errorlog, log import time from PathFinder import retrieve_path, stringify_path from Truth import get_truth_value import argparse """ log("Starting grec_pathfinder.py") parser = argparse.ArgumentParser(description='Find paths taken between articles in wikipedia using data outputted by grec_pathfinder.py. Good luck.') parser.add_argument('sub1', help='Subject 1') parser.add_argument('sub2', help="Subject 2") parser.add_argument('sub3', help='Subject 3') parser.add_argument('obj', help="Object") args = vars(parser.parse_args()) """ #1539052726|00d724bfd2726fd20bd241b1dc8ab49f token = "1539072664|1cb61f76cefb672382503def95d2773d" def cfc_get_truth_value(x, y, token): s = time.time() path = [
def update(self):
    dh.update(self)
    log("Truth.page: Updated page {} in database.".format(self.title))
def remember(self):
    dh.insert_page(self)
    log("Truth.page: Remembered page {} in database.".format(self.title))
def remember(self):
    dh.insert_path(self)
    log("Remembering path {} in database.".format(self.pathstring))
import re
import argparse


# NOTE: this excerpt begins mid-function; the enclosing signature (named
# extract_label here) is an assumption made for readability.
def extract_label(response):
    r = None
    try:
        r = re.findall(
            r'(?<=<span class="wb-itemlink-label" lang="en" dir="ltr">).+(?=<\/span> <span class="wb-itemlink-id">)',
            response)[0]
    except Exception:
        r = None
    return r


import time
from scrapers.logger import log, errorlog

if __name__ == "__main__":
    log("The GREC genie has been freed from the bottle. Starting grec_extractor.py")
    parser = argparse.ArgumentParser(
        description='Find paths taken between articles in Wikipedia.')
    parser.add_argument('grec_path',
                        help='Path to the GREC json file to extract data from.')
    parser.add_argument(
        'output_folder',
        help='NAME OF THE OUTPUT FOLDER. Not a path. All outputs will be stored in data/GREC by default.')
    args = vars(parser.parse_args())

    # Get the command line arguments.
    source = args['grec_path']
    out = args['output_folder']
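# A standalone sketch exercising the label regex above; the HTML fragment is a
# fabricated example of the Wikidata item-link markup it targets, not real
# response data, and the helper name is not part of the project.
def _demo_label_regex():
    sample = ('<span class="wb-itemlink-label" lang="en" dir="ltr">Barack Obama'
              '</span> <span class="wb-itemlink-id">(Q76)</span>')
    return re.findall(
        r'(?<=<span class="wb-itemlink-label" lang="en" dir="ltr">).+'
        r'(?=<\/span> <span class="wb-itemlink-id">)',
        sample)  # -> ['Barack Obama']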