Example 1
def process_article():
    sqs = SQSClient(args.sqs_queue)
    s3 = S3Writer(args.s3_bucket)

    # Drain the shared article queue: persist each page to S3, then notify
    # the parse pipeline via SQS. Runs until shutdown is signalled and the
    # queue is empty (args, shutdown and arq.article_queue are module-level).
    while not (shutdown and arq.article_queue.empty()):
        title, source = arq.article_queue.get()
        s3.save("article", title, source)
        sqs.put(S3Writer.clean(title))
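The worker relies on module-level state: an `args` namespace, a `shutdown` flag, and a shared queue held on `arq`. Purely as an illustration of how such consumers might be launched (nothing beyond `process_article` here is confirmed by the excerpt):

import threading

# Illustrative wiring only: shutdown and arq.article_queue are assumed to be
# module-level state shared with process_article.
workers = [threading.Thread(target=process_article) for _ in range(4)]
for worker in workers:
    worker.start()

# ...producer enqueues (title, source) pairs on arq.article_queue, then:
shutdown = True
for worker in workers:
    worker.join()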
Example 2
def process_article():
    # Same worker as Example 1, but with the queue URL and bucket hard-coded.
    sqs = SQSClient(
        "https://sqs.eu-west-1.amazonaws.com/576699973142/fever-parse-jobs")
    s3 = S3Writer("com.amazon.evi.fever.wiki")

    while not (shutdown and arq.article_queue.empty()):
        title, source = arq.article_queue.get()
        s3.save("article", title, source)
        sqs.put(S3Writer.clean(title))
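`SQSClient` is constructed with a queue URL and exposes `put`. A plausible reading of that wrapper, sketched over boto3 (this is an assumption inferred from the call sites, not the repo's actual class):

import boto3

class SQSClient:
    """Assumed thin wrapper over boto3's SQS client."""

    def __init__(self, queue_url):
        self.queue_url = queue_url
        self.client = boto3.client("sqs")

    def put(self, message):
        # send_message is the standard boto3 call for enqueuing a message.
        self.client.send_message(QueueUrl=self.queue_url, MessageBody=message)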
Example 3
def get_sentence(sent_id):
    client = boto3.client("dynamodb")
    s3 = S3Writer("com.amazon.evi.fever.wiki")

    # Fetch every annotation for this sentence via the sentence_id GSI.
    matching_items = client.query(
        TableName="FeverAnnotations",
        IndexName="sentence_id-index",
        Select="ALL_ATTRIBUTES",
        KeyConditionExpression="sentence_id = :v1",
        ExpressionAttributeValues={":v1": {
            "N": str(sent_id)
        }})

    claims_for_annotation = []
    if "Items" in matching_items:
        claims_for_annotation.extend(
            [prepare_item(item) for item in matching_items["Items"]])

    # Keep every mutation, but deduplicate the original claims.
    originals = {claim["original"] for claim in claims_for_annotation}
    claims = []

    for claim in claims_for_annotation:
        claims.append({"text": claim["mutation"]})

    for claim in originals:
        claims.append({"text": claim})

    # Assumes at least one annotation matched the sentence id.
    entity = claims_for_annotation[0]["entity"]

    # Pull the page intro from S3 and return a Flask JSON response.
    doc = s3.read_string("intro/" + s3.clean(entity))
    return jsonify({
        "entity": entity,
        "body": doc,
        "claims": claims,
        "sentence": claims_for_annotation[0]["sentence"]
    })
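`prepare_item` is not shown in these excerpts. Given that the query returns DynamoDB's typed attribute values while the caller reads plain keys like `claim["original"]`, a hypothetical flattening helper could look like this:

def prepare_item(item):
    # Hypothetical: collapse DynamoDB's typed values, e.g.
    # {"original": {"S": "some claim"}} -> {"original": "some claim"}.
    return {key: next(iter(value.values())) for key, value in item.items()}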
Example 4
def get_wiki_entry(name):
    name = name.strip()
    s3 = S3Writer("com.amazon.evi.fever.wiki")
    try:
        key = s3.clean("intro/" + name)

        body = s3.read_string(key)
        return {"text": body, "canonical_entity": name}
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "NoSuchKey":
            # Retry with the first letter capitalised (Wikipedia convention),
            # then fall back to following the redirect table.
            if name[0].islower():
                return get_wiki_entry(name[0].upper() + name[1:])
            else:
                try:
                    return get_wiki_entry(
                        recursive_redirect_lookup(redirects, redirects[name]))
                except RecursionError:
                    logger.error(
                        "Couldn't resolve {0} from dictionary: recursive redirect loop"
                        .format(name))
                    return None

                except KeyError:
                    logger.error("{0} has no redirect lookup".format(name))
                    return None
        else:
            logger.error(
                "Could not resolve {0} from dictionary because it doesn't exist"
                .format(name))
            return None
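`recursive_redirect_lookup` must be recursive, since the caller traps `RecursionError` as its loop guard. A sketch consistent with that contract (assumed, not the repo's implementation):

def recursive_redirect_lookup(redirects, name):
    # Follow the redirect table until a title no longer redirects. A circular
    # chain keeps recursing until Python raises RecursionError, which
    # get_wiki_entry catches and reports as a redirect loop.
    if name in redirects:
        return recursive_redirect_lookup(redirects, redirects[name])
    return name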
Example 5
def get_wiki_entry(name):
    # Compact variant of Example 4: same lookup and fallback chain, without
    # the logging, and catching only the exceptions that chain can raise.
    s3 = S3Writer(args.s3_bucket)
    try:
        key = s3.clean("intro/" + name)

        return {"text": s3.read_string(key), "canonical_entity": name}
    except botocore.exceptions.ClientError:
        try:
            if name[0].islower():
                return get_wiki_entry(name[0].upper() + name[1:])
            else:
                return get_wiki_entry(
                    recursive_redirect_lookup(redirects, redirects[name]))
        except (KeyError, RecursionError):
            return None
Example 6
            else:
                return get_wiki_entry(
                    recursive_redirect_lookup(redirects, redirects[name]))
        except:
            return None


# Module-level setup: load the redirect table, the page list, and any claims
# already written so the run can resume where it left off.
redirects = get_redirects()

pages_file = args.pages_file

pages = []
with open(pages_file, "r") as f:
    pages.extend([line.strip() for line in f.readlines()])

s3 = S3Writer(args.s3_bucket)

global_id = 0
extra_pages = set()

with open(args.out_pages, "r+") as f:
    extra_pages.update([line.strip() for line in f.readlines()])

done_pages = set()

live = []
if os.path.exists(args.out_file):
    with open(args.out_file, "r") as f:
        live = json.load(f)

for item in tqdm(live, desc="Loading existing claims"):
Example 7

import os
from botocore.exceptions import ClientError
from dataset.reader.wiki_parser import WikiParser
from persistence.s3_persistence import S3Writer

# The bucket name comes from the environment here; Example 8 hard-codes it.
s3 = S3Writer(os.getenv("S3_BUCKET"))
parser = WikiParser(s3)

# Normalise page titles to the underscore form used for the S3 keys.
with open("data/pages.txt") as f:
    files = f.readlines()

files = [file.replace(" ", "_").strip() for file in files]

for file in files:
    try:
        obj = s3.read("article/" + file)
        text = bytes.decode(obj['Body'].read())

        parser.article_callback(file, text)
    except ClientError:
        print("CE" + file)
Example 8
from botocore.exceptions import ClientError

from dataset.reader.wiki_parser import WikiParser
from persistence.s3_persistence import S3Writer

s3 = S3Writer("com.amazon.evi.fever.wiki")
parser = WikiParser(s3)

with open("data/pages.txt") as f:
    files = f.readlines()

files = [file.replace(" ", "_").strip() for file in files]

for file in files:
    try:
        obj = s3.read("article/" + file)
        text = bytes.decode(obj['Body'].read())

        parser.article_callback(file, text)
    except ClientError:
        # Flag pages whose article object is missing from the bucket.
        print("CE" + file)
Example 9
def article_callback(self, title, source):
    # WikiParser hands each parsed article back here: notify SQS with the
    # cleaned title and persist the source through the S3 persistence layer.
    self.sqs.send(S3Writer.clean(title))
    self.persistence.put("article", title, source)
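Across the examples, `S3Writer` is constructed with a bucket name and exercised through `save`/`put`, `read`, `read_string`, and a static `clean`. An interface sketch inferred from those call sites, over boto3 (assumptions throughout; the real class lives in `persistence.s3_persistence`):

import boto3

class S3Writer:
    """Interface inferred from the call sites above; details are assumed."""

    def __init__(self, bucket):
        self.bucket = bucket
        self.client = boto3.client("s3")

    @staticmethod
    def clean(title):
        # Normalise a page title into a safe S3 key fragment (assumed rule,
        # matching the space-to-underscore handling in Examples 7 and 8).
        return title.replace(" ", "_")

    def save(self, prefix, title, body):
        key = prefix + "/" + self.clean(title)
        self.client.put_object(Bucket=self.bucket, Key=key, Body=body)

    put = save  # Example 9 calls put(...) with save's shape; assumed alias.

    def read(self, key):
        # Returns the raw boto3 response; callers decode obj['Body'].read().
        return self.client.get_object(Bucket=self.bucket, Key=key)

    def read_string(self, key):
        return bytes.decode(self.read(key)["Body"].read())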
Example 10
def get_claims(data):
    claims = []
    documents = set()
    doc_ids = dict()
    for record in data:
        documents.add(record["entity"]["s"])

    # Fetch each document body once and assign it a numeric body ID.
    s3 = S3Writer("com.amazon.evi.fever.wiki")
    bodies = dict()
    for i, doc in enumerate(documents):
        doc_ids[doc] = i

        # intro_sentences objects are TSV lines of "sentence_id<TAB>text";
        # keep the text column, or an empty line where it is missing.
        bodies[i] = "\n".join([
            line.split("\t")[1] if len(line.split("\t")) > 1 else ""
            for line in bytes.decode(
                s3.read("intro_sentences/" +
                        s3.clean(doc))['Body'].read()).split("\n")
        ])

    done_ids = set()
    for record in data:
        original = record["original"]["s"]
        mutation_type = record["mutation_type"]["s"]
        mutation = record["mutation"]["s"]
        sentence = record["sentence"]["s"]
        entity = record["entity"]["s"]

        cor_id = record["correlation"]["s"]

        # Emit each original claim once per correlation group; it always
        # agrees with its source document.
        if cor_id not in done_ids:
            done_ids.add(cor_id)
            claims.append({
                "Headline": original,
                "Body ID": doc_ids[entity],
                "Stance": "agree"
            })

        # Map mutation types onto FNC-style stances.
        if mutation_type in {"rephrase", "general", "gen"}:
            claims.append({
                "Headline": mutation,
                "Body ID": doc_ids[entity],
                "Stance": "agree"
            })

        if mutation_type in {"neg", "negate"}:
            claims.append({
                "Headline": mutation,
                "Body ID": doc_ids[entity],
                "Stance": "disagree"
            })

        if mutation_type in {"spec", "specific"}:
            claims.append({
                "Headline": mutation,
                "Body ID": doc_ids[entity],
                "Stance": "discuss"
            })

        if mutation_type in {"sim", "dis", "sub", "substitute_similar",
                             "substitute_dissimilar"}:
            claims.append({
                "Headline": mutation,
                "Body ID": doc_ids[entity],
                "Stance": "unrelated"
            })

    return claims, bodies
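`get_claims` emits FNC-1-style records: headline/body-ID pairs with an agree, disagree, discuss, or unrelated stance, plus the body texts keyed by ID. A quick usage sketch writing them out in that two-file CSV shape (the file names and the `data` payload are illustrative):

import csv
import json

with open("annotations.json") as f:
    data = json.load(f)  # records shaped like the DynamoDB export above

claims, bodies = get_claims(data)

with open("stances.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["Headline", "Body ID", "Stance"])
    writer.writeheader()
    writer.writerows(claims)

with open("bodies.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Body ID", "articleBody"])
    writer.writerows(bodies.items())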