import datetime
import email.utils
import io
import os
import subprocess
from urllib.parse import urlparse

import pytz
import requests
from bs4 import BeautifulSoup
from flask import request
from google.cloud import speech
from requests.exceptions import ConnectionError, HTTPError

# App-level objects (db session, logger, models, helpers such as init_db,
# user_datastore, ParsingRules, and DEFAULT_SEGMENT_LENGTH) are assumed to be
# imported from the application package.


def retrieve_transcript(self, **kwargs):
    """Transcribes the saved audio file with Google Cloud Speech-to-Text.

    Method on the audio model; stores the transcript and confidence on the
    instance and returns both.
    """
    encoding = kwargs.get('encoding') or speech.enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED
    # Fall back to 44.1 kHz if no sample rate is supplied.
    sample_rate = kwargs.get('sample_rate') or 44100
    language_code = kwargs.get('language_code') or self.language
    # Default to True while still respecting an explicit False.
    automatic_punctuation = kwargs.get('automatic_punctuation', True)

    client = speech.SpeechClient()
    with io.open(self._save_path, 'rb') as f:
        content = f.read()

    audio = speech.types.RecognitionAudio(content=content)
    config = speech.types.RecognitionConfig(
        encoding=encoding,
        enable_automatic_punctuation=automatic_punctuation,
        sample_rate_hertz=sample_rate,
        language_code=language_code)
    result = client.recognize(config, audio)

    transcript = ''
    confidence = 0
    for r in result.results:
        # Results arrive per utterance; only the last chunk's confidence is kept.
        confidence = r.alternatives[0].confidence
        transcript += r.alternatives[0].transcript

    self.confidence = confidence
    self.transcript = transcript
    db.commit()
    return transcript, confidence

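# Hedged sketch (not part of the original code): the synchronous `recognize`
# call only accepts roughly a minute of audio, which is presumably why the
# model also splits files into segments (see `split_file` below). For longer
# files, the same pre-2.0 client exposes `long_running_recognize`; the GCS URI
# parameter and the timeout below are illustrative assumptions.
def retrieve_transcript_long(self, gcs_uri, **kwargs):
    client = speech.SpeechClient()
    # Long-running recognition requires the audio to be uploaded to GCS first.
    audio = speech.types.RecognitionAudio(uri=gcs_uri)
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        language_code=kwargs.get('language_code') or self.language)
    operation = client.long_running_recognize(config, audio)
    # Block until the transcription operation completes.
    response = operation.result(timeout=600)
    return ''.join(r.alternatives[0].transcript for r in response.results)
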
def import_article():
    """Fetches an RSS text track and stores its parsed content as an Article."""
    data = request.get_json()
    logger.info('data: %s', data)
    language = data.get('language')
    rss_id = data.get('id')

    track = RSSTrack.get(rss_id)
    try:
        resp = requests.get(track.url)
        resp.raise_for_status()
    except HTTPError as e:
        return {'status': 'error', 'details': str(e)}, resp.status_code

    # The parser is chosen per site, keyed on the track's hostname.
    hostname = urlparse(track.url).hostname
    article = Article(pretty_name=track.name,
                      url=track.url,
                      rss_id=rss_id,
                      language=language)
    parser = ParsingRules.get(hostname)
    # TODO: Save in a better format!
    content = parser.parse(resp.content)
    article.content = content
    db.add(article)
    db.commit()
    return {'status': 'ok', 'article': article.to_json()}, 200

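# Hedged sketch (assumption): `ParsingRules` is not shown in this excerpt. It
# presumably maps a hostname to a site-specific parser; a minimal registry
# could look like this, with BeautifulSoup doing the actual extraction. The
# `GenericParser` fallback is an illustrative guess.
class ParsingRules:
    _rules = {}

    @classmethod
    def register(cls, hostname, parser):
        cls._rules[hostname] = parser

    @classmethod
    def get(cls, hostname):
        # Fall back to a generic parser for unknown sites.
        return cls._rules.get(hostname, GenericParser())


class GenericParser:
    def parse(self, content):
        # Keep only paragraph text; a site-specific parser would do better.
        soup = BeautifulSoup(content, 'html.parser')
        return '\n\n'.join(p.get_text() for p in soup.find_all('p'))
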
def split_file(self, seconds=DEFAULT_SEGMENT_LENGTH):
    """Splits the saved audio file into fixed-length segments with ffmpeg and
    records a Segment row for each piece. Method on the audio model."""
    if not seconds:
        seconds = DEFAULT_SEGMENT_LENGTH

    path = os.path.dirname(self._save_path())
    output_dir = os.path.join(path, str(seconds))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    else:
        # The base has already been split
        return

    ffmpeg_path = os.path.join(output_dir, f'%04d_{self.filename}')
    # TODO: Choice between different tools?
    args = [
        'ffmpeg',
        '-i', self._save_path(),
        '-f', 'segment',
        '-segment_time', str(seconds),
        '-c', 'copy',
        ffmpeg_path
    ]
    self.ffmpeg(args)

    segments = sorted(os.listdir(output_dir))
    logger.debug('segments: %r', segments)
    for idx, f in enumerate(segments):
        logger.debug('saving segment %s: %s', idx, f)
        s = Segment(base_id=self.id,
                    position=idx + 1,
                    length=seconds,
                    language=self.language)
        db.add(s)
    db.commit()

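# Hedged sketch (assumption): the `ffmpeg` helper called above is not shown in
# this excerpt. A minimal wrapper around subprocess might look like this.
def ffmpeg(self, args):
    # check=True raises CalledProcessError if ffmpeg exits non-zero, so a
    # failed split does not silently leave an empty segment directory.
    subprocess.run(args, check=True, capture_output=True)
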
def add_channel():
    """Registers a new RSS channel after checking the feed URL is reachable."""
    data = request.get_json()
    try:
        feed_url = data.get('feed_url')
        resp = requests.head(feed_url)
        resp.raise_for_status()
    except HTTPError:
        return {
            'status': 'error',
            'details': f'Failed to connect to {feed_url} with status code {resp.status_code}'
        }, resp.status_code
    except ConnectionError:
        return {
            'status': 'error',
            'details': f'Server for url {feed_url} is not available'
        }, 503

    channel = RSSChannel(url=feed_url,
                         channel_name=data.get('feed_name'),
                         channel_description=data.get('feed_description'),
                         channel_type=data.get('feed_type'))
    db.add(channel)
    db.commit()
    return {'status': 'ok', 'data': channel.to_json()}, 200

def create_user():
    """Creates the default user on first run. The creation logic is currently
    disabled; only the schema initialisation runs."""
    init_db()
    # u = db.query(User).first()
    # logger.debug('User exists? %r' % u)
    # if not u:
    #     from flask_security.utils import hash_password
    #     user_datastore.create_user(username='******',
    #                                email=os.environ.get('DEFAULT_EMAIL'),
    #                                password=hash_password(os.environ.get('DEFAULT_PASSWORD')),
    #                                seconds_available=600)
    db.commit()

def translate_segment(file, position):
    """Translates a snippet of text belonging to a segment.

    NOTE: This does not necessarily translate a whole segment, merely a part.
    To translate a whole part, we may end up translating some bits twice,
    so we'll have to do something a bit more clever.
    """
    data = request.get_json(force=True)
    target = data.get('target', 'en-GB')
    text = data.get('text')
    logger.debug(f'translate file {file} position {position} text {text}')

    segment = BaseAudio.get(file).get_segment(position)
    logger.debug(f'translate segment: {segment.to_json()}')
    translation = _translate(text, target, segment)
    db.add(translation)
    db.commit()
    ret = {'status': 'ok', 'translation': translation.to_json(segment)}
    return ret, 200

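# Hedged sketch (assumption): `_translate` is not shown in this excerpt. A
# minimal version built on the Google Cloud Translation v2 client could look
# like this; the `Translation` model and its field names are illustrative
# guesses, not part of the original code.
from google.cloud import translate_v2


def _translate(text, target, segment):
    client = translate_v2.Client()
    # The v2 client returns a dict containing 'translatedText' (and the
    # detected source language when none is supplied).
    result = client.translate(text, target_language=target)
    return Translation(segment_id=segment.id,
                       source_text=text,
                       translated_text=result['translatedText'],
                       target_language=target)
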
def save_file():
    """Downloads the file from the URL specified in the POST/PUT body, saves it
    on the filesystem, and creates a record in the database.

    Required fields:
        source_url: The URL to download from (must be open; no auth will be attempted)

    Optional fields:
        language: BCP 47 language code. See
            https://cloud.google.com/speech-to-text/docs/languages for supported
            languages. Files with more than one language should use the string "MULTI".
        pretty_name: The name that the file will be displayed as.

    :return: A JSON representation of the file or an error message as appropriate
    """
    try:
        data = request.get_json()
        source_url = data.get('source_url')
        pretty_name = data.get('pretty_name')
        language = data.get('language')
        # RSS page field
        track_id = data.get('track_id')

        audio = BaseAudio.get(source_url=source_url)
        if BaseAudio.exists(audio):
            logger.info('Audio file already exists')
            return audio.to_json(), 409

        audio = BaseAudio(source_url=source_url,
                          pretty_name=pretty_name,
                          language=language)
        db.add(audio)
        # Flush so the new row has an id before the file is written to disk.
        db.flush()
        audio.save_file()
        db.commit()

        if track_id:
            track = db.query(RSSTrack).filter(RSSTrack.id == track_id).first()
            if track:
                track.is_added = True
                db.commit()
    except Exception as e:
        data = {'status': 'error', 'message': str(e)}
        return data, 500
    return audio.to_json(), 201

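# Hedged usage sketch (assumption): how a client might call this endpoint.
# The route path and host are illustrative; only the JSON fields come from the
# docstring above.
#
#   import requests
#   resp = requests.post('http://localhost:5000/files',
#                        json={'source_url': 'https://example.com/episode.mp3',
#                              'language': 'sv-SE',
#                              'pretty_name': 'Episode 1'})
#   assert resp.status_code == 201
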
def parse_feed():
    """Checks the content of an RSS URL and adds the new episodes to the website.

    :return: The newly added tracks as JSON
    """
    data = request.get_json()
    feed_url = data.get('url')
    req = requests.get(feed_url)
    soup = BeautifulSoup(req.content, 'xml')

    channel = db.query(RSSChannel).filter(RSSChannel.url == feed_url).first()
    if not channel:
        name = soup.find('title').text
        description = soup.find('description').text
        channel = RSSChannel(url=feed_url,
                             channel_name=name,
                             channel_description=description)
        db.add(channel)
        db.commit()

    latest_track = db.query(RSSTrack)\
        .filter(RSSTrack.channel == channel)\
        .order_by(RSSTrack.published_date.desc())\
        .first()

    tracks = soup.find_all('item')
    ret = []
    for item in tracks:
        # Some feeds use <pubDate>, others <pubdate>.
        try:
            date_text = item.find('pubDate').text
        except AttributeError:
            date_text = item.find('pubdate').text
        # Parse the RFC 2822 date into an aware UTC datetime.
        pub_date = datetime.datetime.fromtimestamp(
            email.utils.mktime_tz(email.utils.parsedate_tz(date_text)),
            pytz.utc)

        if latest_track:
            # Stored dates may be naive; make them comparable to pub_date.
            latest_track.published_date = latest_track.published_date.replace(
                tzinfo=pub_date.tzinfo)
            if latest_track.published_date >= pub_date:
                logger.debug(
                    f'Reached {latest_track.name}, which is already in the database'
                )
                break

        if channel.channel_type == 'audio':
            track_url = item.find('enclosure').get('url')
        elif channel.channel_type == 'text':
            track_url = item.find('link').text
        else:
            raise ValueError('Unrecognised channel type')

        name = item.find('title').text
        description = item.find('description').text
        track = RSSTrack(channel_id=channel.id,
                         url=track_url,
                         name=name,
                         description=description,
                         published_date=pub_date)
        ret.append(track)
        db.add(track)
    db.commit()
    return {'status': 'OK', 'data': [track.to_json() for track in ret]}, 200