def process_article():
    """Consume articles from the shared work queue, persisting each to S3
    and enqueueing its cleaned title on SQS for downstream parsing.

    Relies on module globals: ``args`` (endpoints), ``shutdown`` (stop flag)
    and ``arq.article_queue`` (the shared queue).
    """
    queue_client = SQSClient(args.sqs_queue)
    writer = S3Writer(args.s3_bucket)
    # Keep draining until a shutdown has been signalled AND no work remains.
    while True:
        if shutdown and arq.article_queue.empty():
            break
        # NOTE(review): get() blocks; if `shutdown` flips while we wait on an
        # empty queue this call never returns — confirm producers unblock it.
        title, source = arq.article_queue.get()
        writer.save("article", title, source)
        queue_client.put(S3Writer.clean(title))
def process_article(
        queue_url="https://sqs.eu-west-1.amazonaws.com/576699973142/fever-parse-jobs",
        bucket="com.amazon.evi.fever.wiki"):
    """Consume articles from the shared work queue, persisting each to S3
    and enqueueing its cleaned title on SQS for downstream parsing.

    The SQS queue URL and S3 bucket were hard-coded; they are now keyword
    parameters whose defaults preserve the original behaviour, so existing
    ``process_article()`` callers are unaffected.

    :param queue_url: SQS queue URL to notify for each stored article.
    :param bucket: S3 bucket the article source is written to.
    """
    sqs = SQSClient(queue_url)
    s3 = S3Writer(bucket)
    # Relies on module globals `shutdown` (stop flag) and `arq.article_queue`.
    # Keep draining until a shutdown has been signalled AND no work remains.
    while not (shutdown and arq.article_queue.empty()):
        # NOTE(review): get() blocks; if `shutdown` flips while we wait on an
        # empty queue this call never returns — confirm producers unblock it.
        title, source = arq.article_queue.get()
        s3.save("article", title, source)
        sqs.put(S3Writer.clean(title))
def get_sentence(sent_id):
    """Return the entity, intro body and candidate claims for one annotated
    sentence, as a JSON response.

    Queries the ``FeverAnnotations`` DynamoDB table via its
    ``sentence_id-index`` and fetches the entity's intro text from S3.
    """
    client = boto3.client("dynamodb")
    s3 = S3Writer("com.amazon.evi.fever.wiki")
    # All annotation rows recorded against this sentence id.
    matching_items = client.query(
        TableName="FeverAnnotations",
        IndexName="sentence_id-index",
        Select="ALL_ATTRIBUTES",
        KeyConditionExpression="sentence_id = :v1",
        ExpressionAttributeValues={":v1": {"N": str(sent_id)}})

    claims_for_annotation = [
        prepare_item(item) for item in matching_items.get("Items", [])
    ]
    # De-duplicate the original claim texts across annotations.
    originals = {claim["original"] for claim in claims_for_annotation}

    # Mutated claims first (in query order), then each distinct original.
    claims = [{"text": claim["mutation"]} for claim in claims_for_annotation]
    claims.extend({"text": original} for original in originals)

    # NOTE(review): an empty result set makes claims_for_annotation[0] raise
    # IndexError — confirm callers guarantee at least one annotation.
    entity = claims_for_annotation[0]["entity"]
    doc = s3.read_string("intro/" + s3.clean(entity))
    return jsonify({
        "entity": claims_for_annotation[0]["entity"],
        "body": doc,
        "claims": claims,
        "sentence": claims_for_annotation[0]["sentence"]
    })
def get_wiki_entry(name):
    """Fetch the intro text for *name* from S3.

    Resolution strategy on a missing key: first retry with the initial
    letter upper-cased (Wikipedia-style titles), otherwise follow the
    redirect table. Returns a dict with "text" and "canonical_entity",
    or None when the entry cannot be resolved.

    Relies on module globals: ``redirects`` (redirect table), ``logger``,
    and ``recursive_redirect_lookup``.
    """
    name = name.strip()
    s3 = S3Writer("com.amazon.evi.fever.wiki")
    try:
        key = s3.clean("intro/" + name)
        body = s3.read_string(key)
        return {"text": body, "canonical_entity": name}
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "NoSuchKey":
            # NOTE(review): name[0] raises IndexError when name is empty
            # after strip() — confirm callers never pass blank names.
            if name[0].islower():
                # Titles are conventionally capitalised; retry upper-cased.
                return get_wiki_entry(name[0].upper() + name[1:])
            else:
                try:
                    # Follow the redirect chain to a canonical title.
                    return get_wiki_entry(
                        recursive_redirect_lookup(redirects, redirects[name]))
                except RecursionError:
                    logger.error(
                        "Couldn't resolve {0} from dictionary: recursive redirect loop"
                        .format(name))
                    return None
                except KeyError:
                    # No entry in the redirect table for this name.
                    logger.error("{0} has no redirect lookup".format(name))
                    return None
        else:
            # NOTE(review): this branch handles ANY non-NoSuchKey ClientError
            # (throttling, auth, ...) yet logs "doesnt exist" — the message
            # may be misleading for those cases; verify intent.
            logger.error(
                "Could not resolve {0} from dictionary because it doesnt exist"
                .format(name))
            return None
def get_wiki_entry(name):
    """Fetch the intro text for *name* from S3, falling back to an
    upper-cased first letter and then to redirect resolution.

    Returns a dict with "text" and "canonical_entity", or None when the
    entry cannot be resolved. Relies on module globals ``args``,
    ``redirects`` and ``recursive_redirect_lookup``.
    """
    s3 = S3Writer(args.s3_bucket)
    try:
        key = s3.clean("intro/" + name)
        return {"text": s3.read_string(key), "canonical_entity": name}
    # Fix: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt; narrowed to Exception (the expected failures are
    # S3 client errors on a missing key).
    except Exception:
        try:
            if name[0].islower():
                # Titles are conventionally capitalised; retry upper-cased.
                return get_wiki_entry(name[0].upper() + name[1:])
            else:
                # Follow the redirect chain to a canonical title.
                return get_wiki_entry(
                    recursive_redirect_lookup(redirects, redirects[name]))
        # Fix: second bare `except:` narrowed too. Missing redirect
        # (KeyError), empty name (IndexError) and redirect loops
        # (RecursionError) all resolve to "not found".
        except Exception:
            return None
else: return get_wiki_entry( recursive_redirect_lookup(redirects, redirects[name])) except: return None redirects = get_redirects() pages_file = args.pages_file pages = [] with open(pages_file, "r") as f: pages.extend([line.strip() for line in f.readlines()]) s3 = S3Writer(args.s3_bucket) global_id = 0 extra_pages = set() with open(args.out_pages, "r+") as f: extra_pages.update([line.strip() for line in f.readlines()]) done_pages = set() live = [] if os.path.exists(args.out_file): with open(args.out_file, "r") as f: live = json.load(f) for item in tqdm(live, desc="Loading existing claims"):
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os from botocore.exceptions import ClientError from dataset.reader.wiki_parser import WikiParser from persistence.s3_persistence import S3Writer s3 = S3Writer(os.getenv("S3_BUCKET")) parser = WikiParser(s3) with open("data/pages.txt") as f: files = f.readlines() files = [file.replace(" ","_").strip() for file in files] for file in files: try: obj = s3.read("article/"+file) text = bytes.decode(obj['Body'].read()) parser.article_callback(file,text) except ClientError:
import os

from botocore.exceptions import ClientError
from dataset.reader.wiki_parser import WikiParser
from persistence.s3_persistence import S3Writer

# Re-parse stored wiki articles: read each listed page from S3 and feed it
# through the WikiParser callback.
# Bucket is configurable via the environment (matching the env-driven variant
# of this script); the historical bucket remains the default.
s3 = S3Writer(os.getenv("S3_BUCKET", "com.amazon.evi.fever.wiki"))
parser = WikiParser(s3)

# data/pages.txt holds one page title per line; S3 keys use "_" for spaces.
with open("data/pages.txt") as f:
    files = [line.replace(" ", "_").strip() for line in f]

for file in files:
    try:
        obj = s3.read("article/" + file)
        text = obj['Body'].read().decode()
        parser.article_callback(file, text)
    except ClientError:
        # Best-effort batch: report the missing/unreadable article, continue.
        print("CE" + file)
def article_callback(self, title, source):
    """Handle one fetched article: announce its cleaned title on SQS,
    then store the raw source under the "article" prefix."""
    cleaned_title = S3Writer.clean(title)
    # Notify first, then persist — same ordering as before.
    self.sqs.send(cleaned_title)
    self.persistence.put("article", title, source)
def get_claims(data):
    """Convert raw annotation records into FNC-style claims and bodies.

    :param data: iterable of DynamoDB-export records; each maps field name
        to ``{"s": value}`` (entity, original, mutation, mutation_type,
        correlation).
    :returns: ``(claims, bodies)`` where ``claims`` is a list of
        ``{"Headline", "Body ID", "Stance"}`` dicts and ``bodies`` maps a
        numeric body id to the entity's intro text.
    """
    # Stance assigned to each mutation type; unknown types are silently
    # skipped, matching the original if/elif chain.
    stance_by_mutation = {
        "rephrase": "agree", "general": "agree", "gen": "agree",
        "neg": "disagree", "negate": "disagree",
        "spec": "discuss", "specific": "discuss",
        "sim": "unrelated", "dis": "unrelated", "sub": "unrelated",
        "substitute_similar": "unrelated",
        "substitute_dissimilar": "unrelated",
    }

    documents = {record["entity"]["s"] for record in data}

    # One S3 client for the whole batch (it was re-created for every
    # document inside the loop); skipped entirely when nothing to fetch.
    s3 = S3Writer("com.amazon.evi.fever.wiki") if documents else None

    doc_ids = {}
    bodies = {}
    for i, doc in enumerate(documents):
        doc_ids[doc] = i
        raw = s3.read("intro_sentences/" + s3.clean(doc))['Body'].read()
        # Each line is "<sentence_id>\t<sentence>"; keep only the sentence.
        # Fix: guard on the tab split itself — the old code guarded on a
        # whitespace split, so a tab-less line containing spaces raised
        # IndexError.
        lines = bytes.decode(raw).split("\n")
        bodies[i] = "\n".join(
            parts[1] if len(parts) > 1 else ""
            for parts in (line.split("\t") for line in lines))

    claims = []
    done_ids = set()
    for record in data:
        body_id = doc_ids[record["entity"]["s"]]
        cor_id = record["correlation"]["s"]

        # Emit each original claim only once per correlation id.
        if cor_id not in done_ids:
            done_ids.add(cor_id)
            claims.append({
                "Headline": record["original"]["s"],
                "Body ID": body_id,
                "Stance": "agree"
            })

        stance = stance_by_mutation.get(record["mutation_type"]["s"])
        if stance is not None:
            claims.append({
                "Headline": record["mutation"]["s"],
                "Body ID": body_id,
                "Stance": stance
            })

    return claims, bodies