def file(self, lib_object: JSON, file: File) -> File:
    """Populate *file* from the OParl JSON payload *lib_object* and return it.

    Derives a filesystem-safe filename, truncates over-long display names,
    and copies the mime type, dates, urls, text and license across.
    """
    max_len = self.utils.filename_length_cutoff

    # Filename preference: explicit fileName, then a slug of the name plus a
    # guessed extension, finally a slug of the access url's last segment —
    # truncated to the configured cutoff in the fallback branches.
    if lib_object.get("fileName"):
        derived_filename = lib_object.get("fileName")
    elif lib_object.get("name"):
        ext = mimetypes.guess_extension("application/pdf") or ""
        stem_len = max_len - len(ext)
        derived_filename = slugify(lib_object.get("name"))[:stem_len] + ext
    else:
        last_segment = lib_object["accessUrl"].split("/")[-1]
        derived_filename = slugify(last_segment)[-max_len:]

    display_name = lib_object.get("name", "")
    if len(display_name) > 200:
        # Keep the name inside a 200-char field, ending on an ellipsis.
        display_name = textwrap.wrap(display_name, 199)[0] + "\u2026"
    file.name = display_name

    file.filename = derived_filename
    file.mime_type = lib_object.get("mimeType") or "application/octet-stream"
    file.legal_date = self.utils.parse_date(lib_object.get("date"))
    # Sort date: legal date if parseable, else the created timestamp, else now.
    file.sort_date = (
        self.utils.date_to_datetime(file.legal_date)
        or self.utils.parse_datetime(lib_object.get("created"))
        or timezone.now()
    )
    file.oparl_access_url = lib_object.get("accessUrl")
    file.oparl_download_url = lib_object.get("downloadUrl")
    file.filesize = None
    file.parsed_text = lib_object.get("text")
    file.license = lib_object.get("fileLicense")

    # Locations attached to files are currently not handled, due to the lack
    # of data and our own location extraction.
    return file
def build_party_group_geo():
    """Build a party graph with geo attributes and write it as GEXF.

    Reads ``group_parties.json``, adds one node per party (slugified name,
    party/group/geo attributes) and connects every pair of parties that
    belong to the same group, then writes ``./sna/party_group_geo.gexf``.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open('group_parties.json', 'r') as fh:
        groups = json.load(fh)

    G = nx.Graph()
    for group_id, group in enumerate(groups):
        parties = groups[group]
        party_list = []
        for party in parties:
            name = slugify.slugify(party['party'])
            party_list.append(name)
            lat = float(party['data']['lat'])
            lng = float(party['data']['lon'])
            # scatter() presumably jitters coordinates so co-located nodes
            # don't overlap — confirm against its definition.
            lat, lng = scatter(lat, lng)
            G.add_node(name,
                       party_name=party['party'],
                       group_id=group_id,
                       group=group,
                       group_name=party['data']['group_name'],
                       city=party['data']['city'],
                       lat=lat,
                       lng=lng)
        # Connect every pair of parties within the same group. Distinct loop
        # variables: the original reused `i`, shadowing the enumerate index.
        for a in range(len(party_list)):
            for b in range(a + 1, len(party_list)):
                G.add_edge(party_list[a], party_list[b])

    # The original used Python-2 print statements, which are syntax errors
    # under the Python 3 the rest of this file requires.
    print('Nodes:', len(G.nodes()))
    print('Edges:', len(G.edges()))
    nx.write_gexf(G, './sna/party_group_geo.gexf')
def slugify(text, **options):
    """Return a slug for *text*, forwarding *options* to the slugifier.

    python-slugify and awesome-slugify both install a top-level ``slugify``
    name; importing from the ``slugify.slugify`` module explicitly pins the
    intended callable.
    """
    from slugify.slugify import slugify as _slugify
    return _slugify(text, **options)
def process_item(self, item, spider):
    """Extract and convert scraped values into the shapes expected by
    items.py (Deck, VStats, CardsInDeck).

    Returns a plain dict of cleaned deck fields plus the nested
    ``cards_in_deck`` and ``vs_stats`` lists.
    """
    deck_cleaned = {}
    deck_cleaned["deck_url"] = item["deck_url"]

    price = re.match(self.float_pat, item["price"])
    deck_cleaned["price"] = float(price.group()) if price else None

    metashare = re.match(self.float_pat, item["metashare"])
    # The site reports percentages; store them as 0..1 fractions.
    deck_cleaned["metashare"] = (float(metashare.group()) / 100
                                 if metashare else None)

    global_performance = re.match(self.float_pat, item["global_performance"])
    deck_cleaned["global_performance"] = (
        float(global_performance.group()) / 100
        if global_performance else None)

    # Extract the begin/end dates from the era string. findall() can return
    # fewer than two matches on malformed input; the original tuple
    # unpacking raised ValueError in that case, so pad with None instead.
    era_dates = re.findall(self.short_date_pat, item["era"])
    era_begin = era_dates[0] if len(era_dates) > 0 else None
    era_end = era_dates[1] if len(era_dates) > 1 else None
    deck_cleaned["era_begin"] = (datetime.strptime(era_begin, "%d %b %Y")
                                 if era_begin else None)
    deck_cleaned["era_end"] = (datetime.strptime(era_end, "%d %b %Y")
                               if era_end else None)

    # Process each card in the decklist
    cleaned_cards = []
    for card in item["cards_in_deck"]:
        new_card = {}
        new_card["deck_url"] = item["deck_url"]
        new_card["main"] = bool(int(card.pop("data-main")))
        new_card["quantity"] = int(card.pop("data-qt"))
        new_card["card_name"] = card.pop("data-name")
        new_card["card_slug"] = slugify(new_card["card_name"], separator="_")
        cleaned_cards.append(new_card)
    deck_cleaned["cards_in_deck"] = cleaned_cards

    # Process each versus stat from the deck
    cleaned_vs = []
    for vs in item["vs_stats"]:
        new_vs = {}
        new_vs["deck_url"] = item["deck_url"]
        new_vs["vs_deck_url"] = vs.pop("vs_deck_url")
        matches = re.match(self.int_pat, vs.pop("matches"))
        new_vs["matches"] = int(matches.group()) if matches else 0
        # Guard like "matches" above; the original called .group()
        # unconditionally and raised AttributeError on a non-match.
        performance = re.match(self.float_pat, vs.pop("data-perf"))
        new_vs["performance"] = (float(performance.group())
                                 if performance else 0.0)
        cleaned_vs.append(new_vs)
    deck_cleaned["vs_stats"] = cleaned_vs

    logger.debug("DEBUGGING item cleaned:")
    logger.debug(deck_cleaned)
    return deck_cleaned
def file(self, libobject: OParl.File):
    """Import a single OParl File into the local File model.

    Returns None when check_existing() decides the object should be
    skipped; otherwise returns the saved File instance.
    """
    file = self.check_existing(libobject, File, add_defaults=False)
    if not file:
        return
    logging.info("Processing File {}".format(libobject.get_id()))

    # Displayed filename preference: the upstream file name, then a
    # slugified object name plus a guessed extension, finally a slug of the
    # access url — truncated to the configured cutoff.
    if libobject.get_file_name():
        displayed_filename = libobject.get_file_name()
    elif libobject.get_name():
        extension = mimetypes.guess_extension("application/pdf") or ""
        length = self.filename_length_cutoff - len(extension)
        displayed_filename = slugify(
            libobject.get_name())[:length] + extension
    else:
        displayed_filename = slugify(
            libobject.get_access_url())[-self.filename_length_cutoff:]

    file.oparl_id = libobject.get_id()
    # get_name() can be empty/None (see the access-url fallback above); the
    # previous code sliced it unconditionally and raised TypeError on None.
    file.name = (libobject.get_name() or "")[:200]
    file.displayed_filename = displayed_filename
    file.parsed_text = libobject.get_text()
    file.mime_type = libobject.get_mime_type() or "application/octet-stream"
    file.legal_date = self.glib_datetime_to_python_date(libobject.get_date())

    if self.download_files:
        self.download_file(file, libobject)
    else:
        file.storage_filename = ""
        file.filesize = -1

    # Fall back to local text extraction when the API supplied no text.
    if file.storage_filename and not file.parsed_text:
        self.extract_text_from_file(file)

    file.save()
    file.rebuild_locations()
    return file
def build_party_group_geo():
    """Build a party graph with geo attributes and write it as GEXF.

    Reads ``group_parties.json``, adds one node per party (slugified name,
    party/group/geo attributes) and connects every pair of parties that
    belong to the same group, then writes ``./sna/party_group_geo.gexf``.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open('group_parties.json', 'r') as fh:
        groups = json.load(fh)

    G = nx.Graph()
    for group_id, group in enumerate(groups):
        parties = groups[group]
        party_list = []
        for party in parties:
            name = slugify.slugify(party['party'])
            party_list.append(name)
            lat = float(party['data']['lat'])
            lng = float(party['data']['lon'])
            # scatter() presumably jitters coordinates so co-located nodes
            # don't overlap — confirm against its definition.
            lat, lng = scatter(lat, lng)
            G.add_node(name,
                       party_name=party['party'],
                       group_id=group_id,
                       group=group,
                       group_name=party['data']['group_name'],
                       city=party['data']['city'],
                       lat=lat,
                       lng=lng)
        # Connect every pair of parties within the same group. Distinct loop
        # variables: the original reused `i`, shadowing the enumerate index.
        for a in range(len(party_list)):
            for b in range(a + 1, len(party_list)):
                G.add_edge(party_list[a], party_list[b])

    # The original used Python-2 print statements, which are syntax errors
    # under the Python 3 the rest of this file requires.
    print('Nodes:', len(G.nodes()))
    print('Edges:', len(G.edges()))
    nx.write_gexf(G, './sna/party_group_geo.gexf')
def file(self, libobject: OParl.File):
    """Import or update a single OParl File into the local File model.

    Returns the File unchanged (possibly None) when the modification check
    says there is nothing to do, otherwise the saved File instance.
    """
    file, do_update = self.check_for_modification(libobject, File)
    if not file or not do_update:
        return file
    self.logger.info("Processing File {}".format(libobject.get_id()))

    # Displayed filename preference: the upstream file name, then a
    # slugified object name plus a guessed extension, finally a slug of the
    # access url — truncated to the configured cutoff.
    if libobject.get_file_name():
        displayed_filename = libobject.get_file_name()
    elif libobject.get_name():
        extension = mimetypes.guess_extension("application/pdf") or ""
        length = self.filename_length_cutoff - len(extension)
        displayed_filename = slugify(
            libobject.get_name())[:length] + extension
    else:
        displayed_filename = slugify(
            libobject.get_access_url())[-self.filename_length_cutoff:]

    # Snapshot the pre-update state so the expensive location/person
    # extraction below only runs when something relevant changed.
    parsed_text_before = file.parsed_text
    file_name_before = file.name

    file.oparl_id = libobject.get_id()
    file.name = libobject.get_name()
    file.displayed_filename = displayed_filename
    file.mime_type = libobject.get_mime_type() or "application/octet-stream"
    file.legal_date = self.glib_datetime_to_python_date(
        libobject.get_date())
    # NOTE(review): sort_date mirrors the record's created timestamp here,
    # not legal_date — confirm that is intentional.
    file.sort_date = file.created
    file.oparl_access_url = libobject.get_access_url()
    file.oparl_download_url = libobject.get_download_url()

    # If no text comes from the API, don't overwrite previously extracted
    # PDF-content with an empty string
    if libobject.get_text():
        file.parsed_text = libobject.get_text()

    if self.download_files:
        self.download_file(file, libobject)
    else:
        file.storage_filename = ""
        file.filesize = -1

    # Fall back to local extraction when a stored file exists but the API
    # supplied no text.
    parsed_text = file.parsed_text
    if file.storage_filename and not file.parsed_text:
        parsed_text = self.extract_text_from_file(file)

    # Give project-specific hooks a chance to sanitize the object.
    file = self.call_custom_hook("sanitize_file", file)

    # Keep the name within the 200-char column, ending on an ellipsis.
    if len(file.name) > 200:
        file.name = textwrap.wrap(file.name, 199)[0] + "\u2026"

    file.save()

    if file_name_before != file.name or parsed_text_before != file.parsed_text:
        # These two operations are rather CPU-intensive, so we only perform
        # them if something relevant has changed
        file.locations = extract_locations(parsed_text)
        file.mentioned_persons = extract_persons(
            file.name + "\n" + (parsed_text or "") + "\n")
        file.save()

    return file
def validate_variations(self, asset_type, variations):
    """Validate the given map of variations; if valid, the variations are
    returned.

    *variations* is expected to map variation names to lists of
    ``[transform_name, settings_dict]`` pairs. Raises APIError with an
    ``invalid_request`` code on any structural or semantic problem.
    """
    # Check the structure of the variations is valid
    if not isinstance(variations, dict):
        raise APIError('invalid_request',
                       hint='Request body JSON must be an object.')
    if len(variations) == 0:
        raise APIError('invalid_request',
                       hint='At least one variation is required.')
    elif len(variations) > self.config['MAX_VARIATIONS_PER_REQUEST']:
        raise APIError(
            'invalid_request',
            hint=('The maximum number of variations that can be added in '
                  'single request is '
                  f"{self.config['MAX_VARIATIONS_PER_REQUEST']}."))

    for name, transforms in variations.items():
        # Check the name of the variation is valid
        slug = slugify(
            name,
            regex_pattern=ALLOWED_SLUGIFY_CHARACTERS,
        )
        # Unlike slugify we allow dashes at the start/end of the variation
        # name, so we strip dashes before the test.
        if slug != name.strip('-'):
            raise APIError('invalid_request',
                           hint=f'Not a valid variation name: {name}.')

        # Check the required number of transforms have been provided
        if len(transforms) == 0:
            raise APIError(
                'invalid_request',
                hint=('At least one transform per variation is required: '
                      f'{name}.'))

        for i, transform in enumerate(transforms):
            # Check transform structure: a [str, dict] pair
            if not (len(transform) == 2
                    and isinstance(transform[0], str)
                    and isinstance(transform[1], dict)):
                raise APIError(
                    'invalid_request',
                    hint=(f'Invalid transform structure: {transform} '
                          f'({name}).'))

            # Check the transform exists
            transform_cls = get_transform(asset_type, transform[0])
            if not transform_cls:
                raise APIError(
                    'invalid_request',
                    hint=(
                        f'Unknown transform: {asset_type}:{transform[0]} '
                        f'({name}).'))

            # Check only the last transform in the list is flagged as a
            # final transform.
            if transform_cls.final and i < len(transforms) - 1:
                raise APIError(
                    'invalid_request',
                    hint=('Final transform not set as last transform: '
                          f'{asset_type}:{transform[0]} ({name}).'))

            # ...and conversely, the last transform must be a final one.
            if not transform_cls.final and i == len(transforms) - 1:
                raise APIError(
                    'invalid_request',
                    hint=(f'Last transform in list is not final: {name}'))

            # Check the settings for the transform are correct; None values
            # are dropped before the form sees them.
            settings_form = transform_cls.get_settings_form_cls()(
                MultiDict({
                    k: v
                    for k, v in transform[1].items() if v is not None
                }))
            if not settings_form.validate():
                raise APIError(
                    'invalid_request',
                    hint=('Invalid settings for transform: '
                          f'{asset_type}:{transform[0]} ({name}).'),
                    arg_errors=settings_form.errors)

    return variations
async def put(self):
    """Store the uploaded file as an asset.

    Validates the upload and form arguments, optionally virus-scans the
    body, derives the asset name/extension/content type, extracts audio or
    image meta data, stores the file via the backend, inserts the Asset and
    updates the account stats.
    """
    # Make sure a file was received
    files = self.request.files.get('file')
    if not files:
        raise APIError(
            'invalid_request',
            arg_errors={'file': ['No file received.']}
        )
    file = files[0]

    # Validate the arguments
    form = PutForm(to_multi_dict(self.request.body_arguments))
    if not form.validate():
        raise APIError(
            'invalid_request',
            arg_errors=form.errors
        )

    if self.config['ANTI_VIRUS_ENABLED']:
        # Check the file for viruses
        av_client = clamd.ClamdUnixSocket(
            self.config['ANTI_VIRUS_CLAMD_PATH']
        )
        av_scan_result = av_client.instream(io.BytesIO(file.body))
        if av_scan_result['stream'][0] == 'FOUND':
            raise APIError(
                'invalid_request',
                arg_errors={
                    'file': ['File appears to be a virus.']
                }
            )

    form_data = form.data

    # Create a name for the asset
    fname, fext = os.path.splitext(file.filename)
    name = slugify(
        form_data['name'] or fname,
        regex_pattern=ALLOWED_SLUGIFY_CHARACTERS,
        max_length=200
    )

    # Determine the files extension
    # NOTE(review): imghdr is deprecated and removed in Python 3.13 —
    # replace with another sniffer when the project upgrades.
    ext = fext[1:] if fext else imghdr.what(file.filename, file.body)

    # Determine the asset type/content type for the image
    content_type = mimetypes.guess_type(f'f.{ext}')[0] \
        or 'application/octet-stream'
    asset_type = self.config['CONTENT_TYPE_TO_TYPES'].get(
        content_type,
        'file'
    )

    # Build the meta data for the asset
    meta = {
        'filename': file.filename,
        'length': len(file.body)
    }

    if asset_type == 'audio':
        try:
            au_file = io.BytesIO(file.body)
            au_file.name = file.filename
            au = mutagen.File(au_file)
        except Exception:
            # Narrowed from a bare `except:`, which also swallowed
            # SystemExit and KeyboardInterrupt.
            raise APIError(
                'invalid_request',
                arg_errors={
                    'file': ['Unable to open the file as an audio file.']
                }
            )

        if au is not None:
            meta['audio'] = {
                'channels': getattr(au.info, 'channels', -1),
                'length': getattr(au.info, 'length', -1),
                'mode': {
                    0: 'stereo',
                    1: 'joint_stereo',
                    2: 'dual_channel',
                    3: 'mono'
                }.get(getattr(au.info, 'mode', ''), ''),
                'sample_rate': getattr(au.info, 'sample_rate', -1)
            }

    if asset_type == 'image':
        im = None
        try:
            im = Image.open(io.BytesIO(file.body))
            meta['image'] = {
                'mode': im.mode,
                'size': im.size
            }
        except Exception:
            # Narrowed from a bare `except:` (same reason as above).
            raise APIError(
                'invalid_request',
                arg_errors={
                    'file': ['Unable to open the file as an image.']
                }
            )
        finally:
            if im:
                im.close()

    # Create the asset
    asset = Asset(
        uid=Asset.generate_uid(),
        account=self.account,
        secure=form_data['secure'],
        name=name,
        ext=ext,
        type=asset_type,
        content_type=content_type,
        expires=(time.time() + form_data['expire'])
            if form_data['expire'] else None,
        meta=meta
    )

    # Store the file
    backend = self.get_backend(asset.secure)
    await backend.async_store(
        io.BytesIO(file.body),
        asset.store_key,
        loop=asyncio.get_event_loop()
    )

    # Save the asset
    asset.insert()

    # Update the asset stats
    Stats.inc(
        self.account,
        today_tz(tz=self.config['TIMEZONE']),
        {
            'assets': 1,
            'length': asset.meta['length']
        }
    )

    self.write(asset.to_json_type())
def on_changed_name(target, value, oldvalue, initiator):
    """Regenerate ``target.slug`` from the new name value.

    Signature matches an attribute-change event listener (presumably
    SQLAlchemy's ``set`` event — confirm where it is registered).
    """
    new_slug = slugify(value)
    target.slug = new_slug
def on_changed_title(target, value, oldvalue, initiator):
    """Keep ``target.slug`` in sync with the title.

    The slug is (re)generated only when a non-empty title arrives and
    either no slug exists yet or the title actually changed.
    """
    if not value:
        return
    if target.slug and value == oldvalue:
        return
    target.slug = slugify(value)