def query_wikipedia(*, query: str, logger: logging.Logger, sentences: int = 10) -> None:
    """Look `query` up on Wikipedia and log the outcome.

    The summary is requested with ``sentences=sentences``. On success it is
    logged at INFO level and `query` is added to `KNOWN`. A disambiguation
    or missing-page error is logged at ERROR level and `query` is added to
    `UNKNOWN` instead; any other exception propagates to the caller.
    """
    try:
        logger.debug("Searching Wikipedia for %s", query, extra={"traceback": get_traceback()})
        # The wikipedia package has not configured bs4 correctly, which
        # causes a warning; silence it for the duration of this lookup only.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            extract = wikipedia.summary(query, sentences=sentences)
    except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError) as err:
        logger.error(
            "Encountered an error searching Wikipedia for %s: %s",
            query,
            err,
            extra={"traceback": get_traceback()},
        )
        UNKNOWN.add(query)
        return

    # Success path: only reached when the lookup raised nothing.
    logger.info(
        "Found summary information for %s: %s",
        query,
        extract,
        extra={"traceback": get_traceback()},
    )
    KNOWN.add(query)
def analyse(text: str, outpath: pathlib.Path) -> None:
    """Run all text-related tasks for one piece of orbit text.

    The text is parsed with the module-level `NLP` pipeline. Every entity
    labelled ``ORG`` is recorded in `ORGS`, and its lower-cased form is then
    resolved in this order:

    1. already in `KNOWN`, or present in the `words` corpus: nothing to do;
    2. already in `UNKNOWN`: a previous lookup failed, do not retry;
    3. otherwise query Wikipedia and, if that leaves the term in `UNKNOWN`,
       fall back to Wiktionary.

    Args:
        text: The text to analyse.
        outpath: Passed to `get_logger` to obtain the logger for this run.
    """
    logger = get_logger(__name__, outpath)
    logger.debug("Analysing orbit text with spaCy", extra={"traceback": get_traceback()})
    doc = NLP(text)

    # Built on first use and reused for every later entity, so the corpus is
    # materialised at most once per call and membership tests are O(1).
    # Assumes words.words() yields hashable strings -- TODO confirm.
    vocabulary = None

    for ent in doc.ents:
        # Most of the nouns we care about get classified as 'ORG'
        if ent.label_ != "ORG":
            continue

        logger.debug("Found %s", ent.text, extra={"traceback": get_traceback()})
        ORGS.add(ent.text)
        safe_text = ent.text.lower()

        understood = safe_text in KNOWN
        if not understood:
            # Only touch the corpus when KNOWN did not already answer.
            if vocabulary is None:
                vocabulary = frozenset(words.words())
            understood = safe_text in vocabulary

        if understood:
            logger.debug(
                "I know what %s means, I have seen it before",
                ent.text,
                extra={"traceback": get_traceback()},
            )
            continue

        if safe_text in UNKNOWN:
            logger.debug(
                "I have seen %s before but can't work out what it means",
                ent.text,
                extra={"traceback": get_traceback()},
            )
            continue

        query_wikipedia(query=safe_text, logger=logger)
        if safe_text in UNKNOWN:
            # Wikipedia was not helpful, try Wiktionary
            query_wiktionary(query=safe_text, logger=logger)
def analyse_orbit(orbit: dict) -> None:
    """Analyse one orbit and write the results to ``<number>/log.html``.

    The log file is (re)created with an opening ``<section>`` and the
    formatted notes, the text and image analysers then run with that file as
    their output path, and finally the closing ``</section>`` tag is appended.

    Args:
        orbit: Mapping read for the keys ``"number"``, ``"start"``,
            ``"finish"`` and ``"notes"``; it is also passed whole to
            `analyse_images`.
    """
    directory = pathlib.Path(str(orbit["number"]))
    outpath = directory / "log.html"
    # Create-if-missing in a single call: no window between an existence
    # check and the creation, and no error when the directory already exists.
    directory.mkdir(parents=True, exist_ok=True)

    # Mode "w" truncates any log left over from a previous run. The platform
    # default encoding is used, and the heading contains a non-ASCII dash.
    with open(outpath, "w") as fh:
        date_string = format_date_string(orbit["start"], orbit["finish"])
        intro_string = f""" <section> <h2>Orbit {orbit['number']} — {date_string}</h2>"""
        orbit_notes = format_orbit_notes(orbit["notes"])
        fh.write(intro_string + orbit_notes)

    # The logger is only requested once the header has been written and the
    # file closed, so the header comes first in the file.
    logger = get_logger(__name__, outpath)
    logger.debug(
        "Analysing the text for orbit %s",
        orbit["number"],
        extra={"traceback": get_traceback()},
    )
    analyse_text(orbit["notes"], outpath)
    analyse_images(orbit, outpath)

    # Append, so whatever was written to the file in between is kept.
    with open(outpath, "a") as fh:
        fh.write("</section>")
def query_wiktionary(*, query: str, logger: logging.Logger) -> None:
    """Log the Wiktionary definitions for `query` and record the outcome.

    Only the first entry returned by `PARSER.fetch` is consulted. When it
    has definitions, their ``"text"`` values are logged and `query` is added
    to `KNOWN`. When the parser returns no entries at all, or the first
    entry has no definitions, `query` is added to `UNKNOWN`.
    """
    logger.debug("Searching Wiktionary for %s", query, extra={"traceback": get_traceback()})
    entries = PARSER.fetch(query)

    # An empty result must not blow up on entries[0]; treat it exactly like
    # an entry that carries no definitions.
    definitions = entries[0]["definitions"] if entries else []

    if not definitions:
        logger.info(
            "Found nothing in Wiktionary for %s",
            query,
            extra={"traceback": get_traceback()},
        )
        UNKNOWN.add(query)
        return

    logger.info(
        "Found definitions for %s: %s",
        query,
        [definition["text"] for definition in definitions],
        extra={"traceback": get_traceback()},
    )
    KNOWN.add(query)
def read(self, request, creator_id=None, annotation_id=None, *args, **kwargs):
    """Return a slice of annotations selected by the arguments and query string.

    Query parameters read from ``request.GET``:

    * ``includeDeletions`` -- the literal string ``"true"`` keeps deleted
      annotations in the base queryset;
    * ``nested`` -- any non-empty value makes an ``annotation_id`` lookup
      return the other annotations sharing that annotation's target URLs;
    * ``oldest`` / ``newest`` -- ``YYYY-MM-DDTHH:MM:SSZ`` bounds applied to
      ``creation_date`` (``__lte`` and ``__gte`` respectively);
    * ``targetUri`` -- case-insensitive substring match on the target URL;
    * ``start`` and ``limit`` (or ``rows``) -- the slice to return.

    Args:
        request: The incoming request; ``request.GET`` and ``request.user``
            are read.
        creator_id: Optional identifier of the author to filter by.
        annotation_id: Optional identifier of a single annotation.

    Returns:
        The filtered queryset ordered by ``creation_date`` and sliced.

    Raises:
        NotFoundError: If ``creator_id`` is given but matches no profile.
        Exception: Anything else raised while filtering is logged and
            re-raised unchanged.
    """

    def create_date(s):
        """Parse a ``YYYY-MM-DDTHH:MM:SSZ`` string into a naive datetime.

        A string that does not match makes ``re.match`` return ``None``,
        so the attribute access below raises ``AttributeError``.
        """
        # Both fragments are raw strings so that "\d" reaches the regex
        # engine untouched instead of relying on an invalid string escape.
        pattern = (
            r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"
            r"T(?P<hour>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})Z"
        )
        match = re.match(pattern, s)
        parts = {name: int(value) for name, value in match.groupdict().items()}
        # The seconds are part of the pattern, so they are kept rather than
        # being silently truncated to the minute.
        return datetime.datetime(
            year=parts["year"],
            month=parts["month"],
            day=parts["day"],
            hour=parts["hour"],
            minute=parts["minutes"],
            second=parts["seconds"],
        )

    include_deletions = request.GET.get("includeDeletions", 0) == "true"
    active = Annotation.annotations.active()
    annotations = active if include_deletions else active.filter(deleted=False)

    def filter_by_creator_id(request, qs, creator_id):
        """Hide private annotations from non-owners, then filter by author."""
        log.info("Filter by creator id")
        if request.user.is_authenticated():
            # exclude from queryset annotations which are private
            # and are not owned by registered user
            qs = qs.exclude(~Q(author=request.user), private=True)
        else:
            # if user is not logged in exclude all private annotations
            qs = qs.exclude(private=True)
        # NOTE: Django user id has low priority before drupal user id
        # Drupal user? we need to remove this.
        if creator_id:
            user = get_object_or_None(Profile, username=self.fake_username(creator_id)) or get_object_or_None(
                Profile, id=creator_id
            )
            if not user:
                raise NotFoundError("User not found")
            qs = qs.filter(author=user)
        return qs

    def filter_by_annotation_id(request, qs, annotation_id, filtered_by_creator=False, creator_id=None):
        """Filter a queryset, usually already filtered by user, by annotation id."""
        log.info("Filter by annotation id")
        if not annotation_id:
            return qs
        if not filtered_by_creator and creator_id:
            qs = filter_by_creator_id(request, qs, creator_id)
        kw = dict(id=annotation_id)
        if include_deletions:
            kw.setdefault("deleted", True)
        if bool(request.GET.get("nested", False)):
            annotation = qs.get(**kw)
            return qs.filter(
                target__url__in=[t.url for t in annotation.target.all()],
                has_answers=False,
                deleted=include_deletions,
            ).exclude(id=annotation.id)
        return qs.filter(**kw)

    #
    # Filter by constraint
    # date: oldest, newest
    # targetUri: URI
    # start: number
    # limit: number
    #
    def filter_by_constraint(request, qs):
        """Apply the optional oldest / newest / targetUri query parameters."""
        constraints = {
            "oldest": lambda v: ("creation_date__lte", create_date(v)),
            "newest": lambda v: ("creation_date__gte", create_date(v)),
            "targetUri": lambda v: ("target__url__icontains", v),
        }
        lookups = {}
        for name, build in constraints.items():
            if name in request.GET:
                lookup, value = build(request.GET.get(name))
                lookups[lookup] = value
        if lookups:
            qs = qs.filter(**lookups)
        return qs

    def filter_by_limit(request, qs):
        """Order by creation date and slice with start / limit (or rows)."""
        try:
            start = int(request.GET.get("start", 0))
            limit = int(request.GET.get("limit", request.GET.get("rows", 50)))
            end = start + limit
        except ValueError:
            # Any non-numeric value falls back to the first 50 rows.
            start = 0
            end = 50
        return qs.order_by("creation_date")[start:end]

    try:
        # NOTE(review): the private-annotation exclusion lives inside
        # filter_by_creator_id, so it is only applied when a creator_id is
        # given -- confirm that this is intended.
        if creator_id:
            annotations = filter_by_creator_id(request, annotations, creator_id)
        if annotation_id:
            annotations = filter_by_annotation_id(request, annotations, annotation_id)
        annotations = filter_by_constraint(request, annotations)
        return filter_by_limit(request, annotations)
    except Exception:
        # One handler covers NotFoundError and everything else: both were
        # logged identically. A bare raise keeps the original traceback.
        log.info(u"Error in AnnotationHandler.read(). Error was " + get_traceback())
        raise
def create(self, request, creator_id, **kwargs):
    """Create an annotation from ``request.form`` and return it.

    The author is resolved from ``creator_id`` and ``request.user``, the
    annotation and its many-to-many data are saved, any constraints posted
    under ``"ranges"`` are stored, and, when the new annotation has type
    ``"Reply"``, the other non-deleted annotations on the same target URLs
    are flagged with ``has_answers=True``.

    Args:
        request: The incoming request; ``request.form``, ``request.POST``
            and ``request.user`` are read.
        creator_id: Identifier used to look up or create the author profile
            when the requesting user is anonymous.

    Returns:
        The saved annotation. On any failure the exception is logged and the
        exception instance itself is returned, not raised.
    """

    def get_user(request, creator=None):
        """Resolve the author of the new annotation.

        Returns a profile or user object, or a dict with an ``"error"`` key
        when the user is anonymous and no ``creator`` was supplied.
        """
        if creator and request.user.is_anonymous():
            return Profile.objects.get_or_create(drupal_uid=creator, username=self.fake_username(creator))[0]
        elif request.user.is_authenticated():
            return request.user
        else:
            return {
                "error": (
                    "Can not create annotation with fully annonymous user. Please login or provide drupal_uid."
                )
            }

    def save_constraints(request, annotation, field_name="ranges"):
        """Save the constraints posted under ``field_name`` in request.POST.

        Returns ``None`` when the field is absent. Otherwise returns one
        item per dict found in the field: the saved constraint, or ``None``
        where the form did not validate.
        """
        if field_name not in request.POST:
            return None

        # Debug trace goes through the module logger instead of stdout.
        log.debug("savin' constraints {0}".format(request.POST.get(field_name, "No ranges in request.POST")))

        def save_one(data):
            """Validate and store one constraint; return it, or None if invalid."""
            form = ConstraintForm(data)
            if not form.is_valid():
                return None
            const = form.save(commit=False)
            const.annotation = annotation
            const.save()
            # Targets are assigned only after the constraint has been saved.
            const.target = annotation.target.all()
            return const

        return [save_one(i) for i in request.POST[field_name] if isinstance(i, dict)]

    try:
        # NOTE(review): get_user may return an error dict instead of a user.
        # The attribute access below then fails inside this try block, so the
        # caller receives that exception rather than the dict's message.
        author = get_user(request, creator_id)
        annotation = request.form.save(commit=False)
        annotation.author = author
        annotation.private = 0 if author.drupal_uid else request.form.cleaned_data["private"]
        annotation.save()
        request.form.save_m2m()
        # Called for its side effects only; the returned list is not used.
        save_constraints(request, annotation)
        related = Annotation.objects.filter(
            target__url__in=[t.url for t in annotation.target.all()], has_answers=False, deleted=False
        ).exclude(id=annotation.id)
        if annotation.type == "Reply" and related.count():
            related.update(has_answers=True)
        return annotation
    except Exception as e:
        log.error(
            "Errow while creating new one annotation. Error was {0}, traceback: {1}".format(e, get_traceback())
        )
        return e