Code example #1
def gtv(path_taken, display=False):
    # split the '>'-delimited path string into Page objects
    s = time.time()
    if display:
        log(f"path_taken: '{path_taken}'")
    path = [Page(d, display=display) for d in path_taken.split('>')]
    source = path[0]   # first page on the path
    dest = path[-1]    # last page on the path
    #----------------------------------------------------------------------
    # cosine similarity is computed between the source and destination content only
    documents = [source.pagecontent, dest.pagecontent]
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')

    def cos_similarity(textlist):
        tfidf = TfidfVec.fit_transform(textlist)
        return (tfidf * tfidf.T).toarray()

    cos_sim = cos_similarity(documents)
    # ---------------------------------------------------------------------
    cos_sim = cos_sim[0][1]  # off-diagonal entry: similarity between source and destination
    truthvalue = 1.00
    for x in path[1:]:  # every page after the source contributes its backlink count
        print(f"{x.backlinkcount}*{cos_sim}")
        truthvalue += math.log(x.backlinkcount * cos_sim)
    # ---------------------------------------------------------------------
    # final score: average of the inverted log-sum and the cosine similarity
    truthvalue = 1 / truthvalue
    truthvalue *= 0.5
    truthvalue += cos_sim * 0.5
    return truthvalue
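The score computed above reduces to a closed-form expression: the reciprocal of 1 plus the sum of log(backlink_count * cos_sim) over every page after the source, averaged 50/50 with the cosine similarity itself. Below is a minimal, self-contained sketch of that formula with made-up backlink counts and a made-up similarity value (no Page objects or Wikipedia access needed), which can be handy for sanity-checking the score's range.

import math

def truth_value_formula(backlink_counts, cos_sim):
    # Restatement of the arithmetic in gtv() above, detached from the Page class.
    # backlink_counts: backlink count of every page on the path except the source.
    # cos_sim: TF-IDF cosine similarity between source and destination content.
    acc = 1.0
    for count in backlink_counts:
        acc += math.log(count * cos_sim)
    return 0.5 * (1.0 / acc) + 0.5 * cos_sim

# Hypothetical three-hop path with moderate similarity.
print(truth_value_formula([1200, 350, 9000], 0.42))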
Code example #2
def cfc_get_truth_value(x, y, token):
    s = time.time()
    path = [
        title.replace('_', ' ')
        for title in retrieve_path(x.strip(), y.strip(), token)
    ]

    if len(path) == 0:
        errorlog(
            "ERROR: Could not retrieve path. Check page title spelling and make sure token has not expired."
        )
        path = ("undefined")
    else:
        path = stringify_path(path)
        log("Found path {}. Took {:.2f}s.".format(path, s - time.time()))

    score = get_truth_value(path, display=False)
    return score

Code example #3
    def __init__(self, title=None, pageid=None, redirect=True, preload=False, original_title='', backlinksonly=False):
        start_time = time.time()
        log("Truth.page: Creating page object for {}".format(title))
        # Try to get the data from the database first.
        # If record does not exist, value will be None.
        # If none, then run the WikipediaPage constructor and save the data.
        page_db_rows = dh.get_page(title)

        if not page_db_rows:    # if the page does not exist in the database
            log("Truth.page: No existing data found for {} in the database. Retrieving from API.".format(title))
            d = page(title=title, pageid=pageid, auto_suggest=True, redirect=True, preload=False)   # load a WikipediaPage object and copy its attributes onto self
            self.title = title
            self.pageid = d.pageid
            self.backlinkcount = self.api_retrievebacklinkcount()   # retrieve the backlink count using Dispenser's API
            if not backlinksonly:   # the caller wants the full content, not just backlinks
                self.pagecontent = d.content    # retrieve from the API (lazy loading)
            else:
                self.pagecontent = None   # do not retrieve the content
            self.remember() # save the data to the database

        else:   # the data exists in the database
            log("Truth.page: Data for {} found in database. Retrieving from database.".format(title))
            self.title = page_db_rows[0][0]
            if not backlinksonly:   # the caller wants the full content
                if not page_db_rows[0][1]:  # the page row exists, but its content is NULL (e.g. it was previously loaded with backlinksonly=True)
                    log("Retrieving Content (Page is in DB, but content is not loaded).")
                    d = page(title=title, pageid=pageid, auto_suggest=True, redirect=True, preload=False)   # load a WikipediaPage object and copy its attributes onto self
                    self.pageid = d.pageid
                    self.pagecontent = d.content
                    self.update()   # persist the downloaded content to the database
                else:   # the content is present, so just assign the stored row to self.pagecontent
                    self.pagecontent = page_db_rows[0][1]
            else:
                self.pagecontent = None # the caller wants backlinks only, so skip the content
            self.backlinkcount = page_db_rows[0][2] # load backlink count from db
        log("Truth.page: Successfully created "+repr(self) + ", finished {:.5f}s".format(time.time() - start_time))
Code example #4
    def __init__(self, source, dest, pathonly=False):

        page_db_rows = dh.get_path(source, dest)
        self.nodes = []

        if not page_db_rows:  # the path is not in the database, so query the API
            log(f"Querying '{source}' -> '{dest}' from API.")
            self.titles = [
                x.replace('_', ' ')
                for x in get_path(source.strip(), dest.strip(), TOKEN)
            ]

            # Catch the possible errors
            if len(self.titles) == 1:  # Error No. 1: the page does not exist.
                errorlog(self.titles[0])
                raise Exception(self.titles[0])
            elif len(self.titles) == 0:  # Error No. 2: the access token is invalid.
                errorlog("invalid access token")
                raise Exception("Invalid access token")

            self.pathstring = ">".join(self.titles)
            self.nodes = []

            log(f"Found nodes {self.titles}")
            self.source = self.titles[0]
            self.dest = self.titles[-1]
            self.remember()
        else:
            log("Truth.Path: Path Found in database. Loading from DB.")
            self.source = page_db_rows[0][0]
            self.dest = page_db_rows[0][1]
            self.pathstring = page_db_rows[0][2]
            self.cosine_similarity = page_db_rows[0][3]
            self.titles = self.pathstring.split('>')
        if not pathonly:
            for idx, x in enumerate(self.titles):
                if idx == 0 or idx == len(self.titles) - 1:
                    self.nodes.append(Page(x, backlinksonly=False))  # source and destination need full content
                else:
                    self.nodes.append(Page(x, backlinksonly=True))   # intermediate pages only need backlink counts
        log(repr(self))
Code example #5

def stringify_path(lst):
    return ">".join(lst)


x_entities = []
y_entities = []
output_file = "xy.txt"
is_ideologies = False
token = ""
error_threshold = None

if __name__ == '__main__':
    # Read the parameters
    log("PathFinder.py started.")
    parser = argparse.ArgumentParser(
        description='Find paths taken between articles in wikipedia.')
    parser.add_argument(
        'x',
        help='Path to file containing x-axis entities, separated by newlines.')
    parser.add_argument(
        'y',
        help='Path to file containing y-axis entities, separated by newlines.')
    parser.add_argument('output_file', help='Where to output csv file.')
    parser.add_argument(
        'token', help="Access token to be used when querying wikipaths.")
    parser.add_argument('-e',
                        '--error-threshold',
                        dest="error_threshold",
                        default=5)
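Throughout these snippets a path is passed around as a single '>'-delimited string: stringify_path above joins the page titles, and gtv/get_truth_value in code example #1 splits them back apart with .split('>'). A quick round-trip illustration with hypothetical titles:

titles = ["Barack Obama", "United States", "President of the United States"]  # hypothetical path
path_string = stringify_path(titles)     # "Barack Obama>United States>President of the United States"
assert path_string.split('>') == titles  # the truth-value code recovers the titles this way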
Code example #6
import Truth
from Truth import Path

from scrapers.request import get_path
from scrapers.logger import log
import argparse


if __name__ == '__main__':

    # Read the parameters
    log("PathEvaluator.py started.")
    parser = argparse.ArgumentParser(description='All in one package (pathfinding + truthvalue)')
    parser.add_argument('x', help='Source Node')
    parser.add_argument('y', help='End Node')
    parser.add_argument('token', help="Access token to be used when querying wikipaths.")
    
    args = vars(parser.parse_args())


    Truth.TOKEN = args['token']
    b = Path(args['x'], args['y'])
    print(b.get_truthvalue())
Code example #7
"""GREC PATHFINDER
This will take in a csv, 
and output a list of paths.
"""
import pandas as pd
from scrapers.logger import log
import csv
from PathFinder import retrieve_path, stringify_path
import time
import argparse

if __name__ == "__main__":

    log("Starting grec_pathfinder.py")
    parser = argparse.ArgumentParser(
        description=
        'Find paths taken between articles in Wikipedia using data output by grec_extractor.py. Good luck.'
    )
    parser.add_argument('output_tag', help='tag you used in grec_extractor.py')
    parser.add_argument(
        'token', help="Access token to be used when querying wikipaths.")
    args = vars(parser.parse_args())

    token = args['token']
    filename = "gt_with_paths.csv"
    outputfile = f"data/GREC/{args['output_tag']}/{filename}"

    csvfile = open(outputfile, 'w', encoding='utf-8', newline="")
    gt = f"data/GREC/{args['output_tag']}/ground_truth.csv"
    a = pd.read_csv(gt, sep=';')
Code example #8
"""
Authors:
Ibalio, Jan Leryc
Alfafara, Lean
This is the backend for the C# fact checker program.
"""

import sys
from scrapers.logger import errorlog, log
import time
from PathFinder import retrieve_path, stringify_path
from Truth import get_truth_value
import argparse
"""
log("Starting grec_pathfinder.py")
parser = argparse.ArgumentParser(description='Find paths taken between articles in wikipedia using data outputted by grec_pathfinder.py. Good luck.')
parser.add_argument('sub1', help='Subject 1')
parser.add_argument('sub2', help="Subject 2")
parser.add_argument('sub3', help='Subject 3')
parser.add_argument('obj', help="Object")
args = vars(parser.parse_args())
"""

#1539052726|00d724bfd2726fd20bd241b1dc8ab49f

token = "1539072664|1cb61f76cefb672382503def95d2773d"


def cfc_get_truth_value(x, y, token):
    s = time.time()
    path = [
Code example #9
    def update(self):
        dh.update(self)
        log("Truth.page: Updated page {} in database.".format(self.title))
Code example #10
    def remember(self):
        dh.insert_page(self)
        log("Truth.page: Remembered page {} in database.".format(self.title))
Code example #11
    def remember(self):
        dh.insert_path(self)
        log("Remembered path {} in database.".format(self.pathstring))
Code example #12
    # extract the English label from the wb-itemlink-label span of a Wikidata search-result page
    r = None
    try:
        r = re.findall(
            r'(?<=<span class="wb-itemlink-label" lang="en" dir="ltr">).+(?=<\/span> <span class="wb-itemlink-id">)',
            response)[0]
    except IndexError:  # no match found in the response
        r = None
    return r


import argparse
import time
from scrapers.logger import log, errorlog

if __name__ == "__main__":

    log("The GREC genie has been freed from the bottle. Starting grec_extractor.py"
        )
    parser = argparse.ArgumentParser(
        description='Extract data from a GREC json file.')
    parser.add_argument('grec_path',
                        help='path to GREC json file to extract data from.')
    parser.add_argument(
        'output_folder',
        help=
        'NAME OF THE OUTPUT FOLDER. Not a path. All outputs will be stored in data/GREC by default.'
    )
    args = vars(parser.parse_args())

    # Get the command line arguments.
    source = args['grec_path']
    out = args['output_folder']
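The lookaround regular expression at the top of code example #12 pulls the English label out of the wb-itemlink-label span of a Wikidata search-result page. A small offline sketch against a hand-written HTML fragment (illustrative only, not a real API response) shows how the match behaves:

import re

sample = ('<span class="wb-itemlink-label" lang="en" dir="ltr">Barack Obama'
          '</span> <span class="wb-itemlink-id">(Q76)</span>')
label = re.findall(
    r'(?<=<span class="wb-itemlink-label" lang="en" dir="ltr">).+(?=<\/span> <span class="wb-itemlink-id">)',
    sample)[0]
print(label)  # Barack Obama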