Example #1
    def find_or_create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Select a single row from the database matching the hash or insert a row with the hash values and return the
        inserted row as a hash."""

        # FIXME probably do this in a serialized transaction?

        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        insert_hash = insert_hash.copy()  # To be able to safely modify it

        if len(insert_hash) == 0:
            raise McFindOrCreateException("Hash to INSERT or SELECT is empty")

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in insert_hash:
            del insert_hash["submit"]

        row = self.select(table=table, what_to_select='*', condition_hash=insert_hash)
        if row is not None and row.rows() > 0:
            return row.hash()
        else:
            # try to create it, but if some other process has created it because we don't have a lock, just use that one
            try:
                return self.create(table=table, insert_hash=insert_hash)
            except McUniqueConstraintException:
                return self.select(table=table, what_to_select='*', condition_hash=insert_hash).hash()
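A minimal usage sketch (not from the original listing): it assumes an open DatabaseHandler `db` and a `tags` table whose unique constraint covers the inserted columns.

# Hypothetical usage; the handler `db`, the table and the column names are assumptions.
tag = db.find_or_create(
    table='tags',
    insert_hash={'tag': 'sample-tag', 'tag_sets_id': 1},
)
# Repeated calls with the same hash return the same row instead of inserting duplicates.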
Example #2
    def __init__(self,
                 no_dedup_sentences: bool = False,
                 no_delete: bool = False,
                 no_tag_extractor_version: bool = False,
                 use_cache: bool = False,
                 use_existing: bool = False):
        """Constructor."""

        if isinstance(no_dedup_sentences, bytes):
            no_dedup_sentences = decode_object_from_bytes_if_needed(no_dedup_sentences)
        if isinstance(no_delete, bytes):
            no_delete = decode_object_from_bytes_if_needed(no_delete)
        if isinstance(no_tag_extractor_version, bytes):
            no_tag_extractor_version = decode_object_from_bytes_if_needed(no_tag_extractor_version)
        if isinstance(use_cache, bytes):
            use_cache = decode_object_from_bytes_if_needed(use_cache)
        if isinstance(use_existing, bytes):
            use_existing = decode_object_from_bytes_if_needed(use_existing)

        # MC_REWRITE_TO_PYTHON: remove weird casts after Python rewrite
        no_dedup_sentences = bool(int(no_dedup_sentences))
        no_delete = bool(int(no_delete))
        no_tag_extractor_version = bool(int(no_tag_extractor_version))
        use_cache = bool(int(use_cache))
        use_existing = bool(int(use_existing))

        self.__no_dedup_sentences = no_dedup_sentences
        self.__no_delete = no_delete
        self.__no_tag_extractor_version = no_tag_extractor_version
        self.__use_cache = use_cache
        self.__use_existing = use_existing
Example #3
def create(db: DatabaseHandler, download: dict, extract: dict) -> dict:
    """Create a download_text hash and insert it into the database. Delete any existing download_text row for the
    download."""

    # FIXME don't pass freeform "extract" dict, we need just the "extracted_text"

    download = decode_object_from_bytes_if_needed(download)
    extract = decode_object_from_bytes_if_needed(extract)

    db.query("""
        DELETE FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']})

    download_text = db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
        RETURNING *
    """, {
        'downloads_id': download['downloads_id'],
        'download_text': extract['extracted_text'],
    }).hash()

    db.query("""
        UPDATE downloads
        SET extracted = 't'
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']})

    return download_text
Example #4
    def __init__(self,
                 email: str,
                 full_name: str = None,
                 notes: str = None,
                 active: bool = None,
                 weekly_requests_limit: int = None,
                 weekly_requested_items_limit: int = None,
                 password: str = None,
                 password_repeat: str = None,
                 role_ids: List[int] = None):
        super().__init__(
            email=email,
            full_name=full_name,
            notes=notes,
            active=active,
            weekly_requests_limit=weekly_requests_limit,
            weekly_requested_items_limit=weekly_requested_items_limit
        )

        password = decode_object_from_bytes_if_needed(password)
        password_repeat = decode_object_from_bytes_if_needed(password_repeat)

        if password is not None and password_repeat is not None:
            password_validation_message = validate_new_password(
                email=self.email(),
                password=password,
                password_repeat=password_repeat
            )
            if password_validation_message:
                raise McAuthUserException("Password is invalid: %s" % password_validation_message)

        self.__password = password
        self.__password_repeat = password_repeat
        self.__role_ids = role_ids
Example #5
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint "stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' due to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None

        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
Example #6
    def __init__(self,
                 host: str,
                 port: int,
                 username: str,
                 password: str,
                 database: str,
                 do_not_check_schema_version: bool = False):
        """Database handler constructor; connects to PostgreSQL too."""

        host = decode_object_from_bytes_if_needed(host)
        # noinspection PyTypeChecker
        port = int(decode_object_from_bytes_if_needed(port))
        username = decode_object_from_bytes_if_needed(username)
        password = decode_object_from_bytes_if_needed(password)
        database = decode_object_from_bytes_if_needed(database)

        self.__primary_key_columns = {}
        self.__schema_version_check_pids = {}
        self.__print_warnings = True
        self.__in_manual_transaction = False
        self.__conn = None
        self.__db = None

        self.__connect(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
            do_not_check_schema_version=do_not_check_schema_version
        )
Example #7
    @classmethod
    def run_job(cls, stories_id: int, topics_id: int) -> None:
        """Run the extract_story_links job, using mediawords.tm.extract_story_links for the logic."""
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)
        if stories_id is None:
            raise McExtractStoryLinksJobException("'stories_id' is None.")

        if isinstance(topics_id, bytes):
            topics_id = decode_object_from_bytes_if_needed(topics_id)
        if topics_id is None:
            raise McExtractStoryLinksJobException("'topics_id' is None.")

        stories_id = int(stories_id)
        topics_id = int(topics_id)

        log.info("Start fetching extracting links for stories_id %d topics_id %d" % (stories_id, topics_id))

        try:
            db = connect_to_db()
            story = db.require_by_id(table='stories', object_id=stories_id)
            topic = db.require_by_id(table='topics', object_id=topics_id)
            mediawords.tm.extract_story_links.extract_links_for_topic_story(db, story, topic)

        except Exception as ex:
            log.error("Error while processing story {}: {}".format(stories_id, ex))
            raise McExtractStoryLinksJobException(
                "Unable to process story {}: {}".format(stories_id, traceback.format_exc())
            )

        log.info("Finished fetching extracting links for stories_id %d topics_id %d" % (stories_id, topics_id))
Example #8
def send_password_reset_token(db: DatabaseHandler, email: str, password_reset_link: str) -> None:
    """Prepare for password reset by emailing the password reset token."""

    email = decode_object_from_bytes_if_needed(email)
    password_reset_link = decode_object_from_bytes_if_needed(password_reset_link)

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
        full_name = user.full_name()

    except Exception as ex:
        log.warning("Unable to fetch user profile for user '%s': %s" % (email, str(ex),))
        full_name = 'Nonexistent user'

    # If user was not found, send an email to a random address anyway to avoid timing attack
    full_password_reset_link = _generate_password_reset_token(
        db=db,
        email=email,
        password_reset_link=password_reset_link,
    )
    if not full_password_reset_link:
        log.warning("Unable to generate full password reset link for email '%s'" % email)
        email = '*****@*****.**'
        full_password_reset_link = 'password reset link'

    message = AuthResetPasswordMessage(to=email, full_name=full_name, password_reset_url=full_password_reset_link)
    if not send_email(message):
        raise McAuthResetPasswordException('Unable to send password reset email.')
Example #9
def password_reset_token_is_valid(db: DatabaseHandler, email: str, password_reset_token: str) -> bool:
    """Validate password reset token (used for both user activation and password reset)."""
    email = decode_object_from_bytes_if_needed(email)
    password_reset_token = decode_object_from_bytes_if_needed(password_reset_token)

    if not (email and password_reset_token):
        log.error("Email and / or password reset token is empty.")
        return False

    # Fetch readonly information about the user
    password_reset_token_hash = db.query("""
        SELECT auth_users_id,
               email,
               password_reset_token_hash
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': email}).hash()
    if password_reset_token_hash is None or 'auth_users_id' not in password_reset_token_hash:
        log.error("Unable to find user %s in the database." % email)
        return False

    password_reset_token_hash = password_reset_token_hash['password_reset_token_hash']

    if password_hash_is_valid(password_hash=password_reset_token_hash, password=password_reset_token):
        return True
    else:
        return False
Example #10
def validate_new_password(email: str, password: str, password_repeat: str) -> str:
    """Check if password complies with strength the requirements.

    Returns empty string on valid password, error message on invalid password."""

    email = decode_object_from_bytes_if_needed(email)
    password = decode_object_from_bytes_if_needed(password)
    password_repeat = decode_object_from_bytes_if_needed(password_repeat)

    if not email:
        return 'Email address is empty.'

    if not (password and password_repeat):
        return 'To set the password, please repeat the new password twice.'

    if password != password_repeat:
        return 'Passwords do not match.'

    if len(password) < __MIN_PASSWORD_LENGTH or len(password) > __MAX_PASSWORD_LENGTH:
        return 'Password must be between %d and %d characters in length.' % (__MIN_PASSWORD_LENGTH, __MAX_PASSWORD_LENGTH,)

    if password == email:
        return "New password is your email address; don't cheat!"

    return ''
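A brief usage sketch: the empty-string return value signals a valid password, so callers can treat the result as an error message, as Example #4 does.

# Hypothetical usage; the email and passwords are made up.
error = validate_new_password(email='user@example.com', password='correct horse battery', password_repeat='correct horse battery')
if error:
    raise McAuthUserException("Password is invalid: %s" % error)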
Example #11
def extract_tarball_to_directory(archive_file: str, dest_directory: str, strip_root: bool = False) -> None:
    """Extract Tar archive (.tar, .tar.gz or .tgz) to destination directory, optionally stripping the root directory
    first."""

    archive_file = decode_object_from_bytes_if_needed(archive_file)
    dest_directory = decode_object_from_bytes_if_needed(dest_directory)

    if not os.path.isfile(archive_file):
        raise McExtractTarballToDirectoryException("Archive at '%s' does not exist" % archive_file)

    archive_file_extension = file_extension(archive_file)
    if archive_file_extension in [".gz", ".tgz"]:
        tar_args = "-zxf"
    elif archive_file_extension in [".tar"]:
        tar_args = "-xf"
    else:
        raise McExtractTarballToDirectoryException("Unsupported archive '%s' with extension '%s'" %
                                                   (archive_file, archive_file_extension))

    args = ["tar",
            tar_args, archive_file,
            "-C", dest_directory]
    if strip_root:
        args += ['--strip', '1']

    try:
        run_command_in_foreground(args)
    except McRunCommandInForegroundException as ex:
        raise McExtractTarballToDirectoryException("Error while extracting archive '%s': %s" % (archive_file, str(ex)))
Example #12
def link_canonical_url_from_html(html: str, base_url: Optional[str] = None) -> Optional[str]:
    """From the provided HTML, determine the <link rel="canonical" /> URL (if any)."""
    html = str(decode_object_from_bytes_if_needed(html))

    base_url_decode = decode_object_from_bytes_if_needed(base_url)
    base_url = None if base_url_decode is None else str(base_url_decode)

    link_elements = re.findall(r'(<\s*?link.+?>)', html, re.I)
    for link_element in link_elements:
        if re.search(r'rel\s*?=\s*?["\']\s*?canonical\s*?["\']', link_element, re.I):
            match = re.search(r'href\s*?=\s*?["\'](.+?)["\']', link_element, re.I)
            if match:
                url = match.group(1)
                if not is_http_url(url):
                    # Maybe it's an absolute path?
                    if base_url is not None:
                        return urljoin(base=base_url, url=url)
                    else:
                        log.debug(
                            "HTML <link rel='canonical'/> found, but the new URL '%s' doesn't seem to be valid." % url
                        )
                else:
                    # Looks like URL, so return it
                    return url
    return None
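A usage sketch (the HTML and URLs are made up): a relative canonical href is resolved against base_url when one is provided, and yields None otherwise.

html = '<link rel="canonical" href="/about"/>'
link_canonical_url_from_html(html=html, base_url='http://example.com/page')  # 'http://example.com/about'
link_canonical_url_from_html(html=html)                                      # None (relative href, no base URL)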
Example #13
    def select(self, table: str, what_to_select: str, condition_hash: dict = None) -> DatabaseResult:
        """SELECT chosen columns from the table that match given conditions."""

        table = decode_object_from_bytes_if_needed(table)
        what_to_select = decode_object_from_bytes_if_needed(what_to_select)
        condition_hash = decode_object_from_bytes_if_needed(condition_hash)

        if condition_hash is None:
            condition_hash = {}

        condition_hash = condition_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in condition_hash:
            del condition_hash["submit"]

        sql_conditions = []

        for key, value in condition_hash.items():
            condition = key
            condition += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python
            sql_conditions.append(condition)

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                condition_hash[key] = value

        sql = "SELECT %s " % what_to_select
        sql += "FROM %s " % table
        if len(sql_conditions) > 0:
            sql += "WHERE %s" % " AND ".join(sql_conditions)

        return self.query(sql, condition_hash)
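A usage sketch (table and column names are assumptions): every condition_hash key becomes an "=" condition joined with AND, and the values are bound by psycopg2 rather than interpolated by Python.

# Runs: SELECT * FROM media WHERE media_id = %(media_id)s
medium = db.select(table='media', what_to_select='*', condition_hash={'media_id': 1}).hash()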
Example #14
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""

    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    story = db.create(
        table='stories',
        insert_hash={
            'media_id': int(feed['media_id']),
            'url': "http://story.test/%s" % label,
            'guid': "guid://story.test/%s" % label,
            'title': "story %s" % label,
            'description': "description %s" % label,
            'publish_date': '2016-10-15 08:00:00',
            'collect_date': '2016-10-15 10:00:00',
            'full_text_rss': True,
        }
    )

    db.create(
        table='feeds_stories_map',
        insert_hash={
            'feeds_id': int(feed['feeds_id']),
            'stories_id': int(story['stories_id']),
        }
    )

    return story
Example #15
    def find_by_id(self, table: str, object_id: int) -> Union[Dict[str, Any], None]:
        """Do an ID lookup on the table and return a single row match if found."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McFindByIDException("Primary key for table '%s' was not found" % table)

        # Python substitution
        find_by_id_query = "SELECT * FROM %(table)s WHERE %(id_column)s" % {
            "table": table,
            "id_column": primary_key_column,
        }

        # psycopg2 substitution
        result = self.query(find_by_id_query + " = %(id_value)s", {'id_value': object_id})
        if result.rows() > 1:
            raise McFindByIDException("More than one row was found for ID '%d' from table '%s'" % (object_id, table))
        elif result.rows() == 1:
            return result.hash()
        else:
            return None
Example #16
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]
    if content_delay:
        # Delay download of content by this many hours. This is useful for sources that are likely to significantly
        # change content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
Example #17
def store_test_data_to_individual_files(basename: str, data: list) -> None:
    """Write the given data to disk under the given basename; split the data (list) into individual files."""
    basename = decode_object_from_bytes_if_needed(basename)
    data = decode_object_from_bytes_if_needed(data)

    data_dict = {}
    for story in data:
        stories_id = story.get('stories_id', None)
        if not stories_id:
            raise McStoreTestDataToIndividualFilesException("Story ID is unset for story: {}".format(story))

        if stories_id in data_dict:
            raise McStoreTestDataToIndividualFilesException(
                "Story ID is not unique (such a story already exists in the dict) for story: {}".format(story)
            )

        data_dict[stories_id] = story

    # Remove all files before overwriting them (in case the new unit test contains *fewer* stories, we don't want old
    # files lying around)
    old_data_files = __test_data_files(basename=basename)
    log.info("Will remove old data files at path '{}': {}".format(basename, old_data_files))
    for path in old_data_files:
        os.unlink(path)

    # Write dict to files
    for index in data_dict.keys():
        store_test_data(basename=str(index), data=data_dict[index], subdirectory=basename)
Example #18
def get_session_lock(db: mediawords.db.DatabaseHandler, lock_type: str, lock_id: int, wait: bool = False) -> bool:
    """Get a postgres advisory lock with the lock_type and lock_id as the two keys.

    Arguments:
    db - db handle
    lock_type - must be in LOCK_TYPES dict above
    lock_id - id for the particular lock within the type
    wait - if true, block while waiting for the lock, else return false if the lock is not available

    Returns:
    True if the lock was acquired
    """
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))

    if isinstance(lock_id, bytes):
        lock_id = decode_object_from_bytes_if_needed(lock_id)
    lock_id = int(lock_id)

    if isinstance(wait, bytes):
        wait = decode_object_from_bytes_if_needed(wait)
    wait = bool(wait)

    log.debug("trying for lock: %s, %d" % (lock_type, lock_id))

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    if wait:
        db.query("select pg_advisory_lock(%(a)s, %(b)s)", {'a': lock_type_id, 'b': lock_id})
        return True
    else:
        r = db.query("select pg_try_advisory_lock(%(a)s, %(b)s) as locked", {'a': lock_type_id, 'b': lock_id}).hash()
        return r['locked']
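A usage sketch (the lock type name is a placeholder that would have to exist in LOCK_TYPES): with wait=False the call returns immediately instead of blocking.

# Hypothetical usage with an open handler `db`
if get_session_lock(db, lock_type='some-lock-type', lock_id=42, wait=False):
    pass  # do the exclusive work here
else:
    log.info("Another process holds the lock; skipping.")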
Example #19
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download."""
    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward.  so we want to keep the feed_error state even if we redownload
    # the content

    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        path = _get_store_for_writing().store_content(db, download['downloads_id'], content)
    except Exception as ex:
        raise McDBIDownloadsException("error while trying to store download %d: %s" % (download['downloads_id'], ex))

    if new_state == 'success':
        download['error_message'] = ''

    db.update_by_id(
        table='downloads',
        object_id=download['downloads_id'],
        update_hash={'state': new_state, 'path': path, 'error_message': download['error_message']},
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
Example #20
def _set_extractor_results_cache(db, download: dict, results: dict) -> None:
    """Store results in extractor cache and manage size of cache."""

    # This cache is used as a backhanded way of extracting stories asynchronously in the topic spider.  Instead of
    # submitting extractor jobs and then directly checking whether a given story has been extracted, we just
    # throw extraction jobs in chunks into the extractor job and cache the results.  Then if we re-extract
    # the same story shortly after, this cache will hit and the cost will be trivial.

    download = decode_object_from_bytes_if_needed(download)
    results = decode_object_from_bytes_if_needed(results)

    # Upsert cache entry
    db.query("""
        INSERT INTO cache.extractor_results_cache (
            extracted_html,
            extracted_text,
            downloads_id
        ) VALUES (
            %(extracted_html)s,
            %(extracted_text)s,
            %(downloads_id)s
        ) ON CONFLICT (downloads_id) DO UPDATE SET
            extracted_html = EXCLUDED.extracted_html,
            extracted_text = EXCLUDED.extracted_text
    """, {
        'extracted_html': results['extracted_html'],
        'extracted_text': results['extracted_text'],
        'downloads_id': int(download['downloads_id']),
    })
Example #21
def fetch_test_data(basename: str, subdirectory: str = '') -> dict:
    """Fetch the given data from disk."""

    basename = decode_object_from_bytes_if_needed(basename)
    subdirectory = decode_object_from_bytes_if_needed(subdirectory)

    file_path = _get_data_file(basename=basename, subdirectory=subdirectory)
    with open(file_path, mode='r', encoding='utf-8') as f:
        return decode_json(f.read())
Example #22
    def __init__(self, api_key: str, ip_address: str = None):
        api_key = decode_object_from_bytes_if_needed(api_key)
        ip_address = decode_object_from_bytes_if_needed(ip_address)

        if not api_key:
            raise McAuthUserException("API key is unset.")

        self.__api_key = api_key
        self.__ip_address = ip_address
Example #23
def _get_all_string_match_positions(haystack: str, needle: str) -> List[int]:
    haystack = decode_object_from_bytes_if_needed(haystack)
    needle = decode_object_from_bytes_if_needed(needle)

    positions = []

    for match in re.finditer(pattern=needle, string=haystack):
        positions.append(match.start())

    return positions
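Note that the needle is passed to re.finditer() as a pattern, so regex metacharacters in it are interpreted rather than matched literally:

_get_all_string_match_positions(haystack='spam ham spam', needle='spam')  # [0, 9]
_get_all_string_match_positions(haystack='a.b', needle='.')               # [0, 1, 2] -- '.' matches any character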
Example #24
def store_test_data(basename: str, data: dict, subdirectory: str = '') -> None:
    """Write the given data to disk under the given basename."""

    basename = decode_object_from_bytes_if_needed(basename)
    data = decode_object_from_bytes_if_needed(data)
    subdirectory = decode_object_from_bytes_if_needed(subdirectory)

    file_path = _get_data_file(basename=basename, subdirectory=subdirectory)
    with open(file_path, mode='w', encoding='utf-8') as f:
        f.write(encode_json(json_obj=data, pretty=True))
Example #25
def change_password(db: DatabaseHandler,
                    email: str,
                    new_password: str,
                    new_password_repeat: str,
                    do_not_inform_via_email: bool = False) -> None:
    """Change user's password."""

    email = decode_object_from_bytes_if_needed(email)
    new_password = decode_object_from_bytes_if_needed(new_password)
    new_password_repeat = decode_object_from_bytes_if_needed(new_password_repeat)

    if isinstance(do_not_inform_via_email, bytes):
        do_not_inform_via_email = decode_object_from_bytes_if_needed(do_not_inform_via_email)

    do_not_inform_via_email = bool(int(do_not_inform_via_email))

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthChangePasswordException('User with email address "%s" does not exist.' % email)

    password_validation_message = validate_new_password(email=email,
                                                        password=new_password,
                                                        password_repeat=new_password_repeat)
    if password_validation_message:
        raise McAuthChangePasswordException("Unable to change password: %s" % password_validation_message)

    # Hash + validate the password
    try:
        password_new_hash = generate_secure_hash(password=new_password)
    except Exception as ex:
        raise McAuthChangePasswordException("Unable to hash a new password: %s" % str(ex))

    if not password_new_hash:
        raise McAuthChangePasswordException("Generated password hash is empty.")

    # Set the password hash
    db.query("""
        UPDATE auth_users
        SET password_hash = %(password_hash)s,
            active = TRUE
        WHERE email = %(email)s
    """, {
        'email': email,
        'password_hash': password_new_hash,
    })

    if not do_not_inform_via_email:

        message = AuthPasswordChangedMessage(to=email, full_name=user.full_name())
        if not send_email(message):
            raise McAuthChangePasswordException(
                'The password has been changed, but I was unable to send an email notifying you about the change.'
            )
Example #26
    def update_by_id(self, table: str, object_id: int, update_hash: dict) -> Union[Dict[str, Any], None]:
        """Update the row in the table with the given ID. Ignore any fields that start with '_'."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)
        update_hash = decode_object_from_bytes_if_needed(update_hash)

        update_hash = update_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in update_hash:
            del update_hash["submit"]

        update_hash = {k: v for k, v in update_hash.items() if not k.startswith("_")}

        if len(update_hash) == 0:
            raise McUpdateByIDException("Hash to UPDATE is empty.")

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McUpdateByIDException("Primary key for table '%s' was not found" % table)

        keys = []
        for key, value in update_hash.items():
            key_value = key

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                update_hash[key] = value

            key_value += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python

            keys.append(key_value)

        update_hash['__object_id'] = object_id

        sql = "UPDATE %s " % table
        sql += "SET %s " % ", ".join(keys)
        sql += "WHERE %s = " % primary_key_column
        sql += "%(__object_id)s"  # "%(__object_id)s" to be resolved by psycopg2, not Python

        try:
            self.query(sql, update_hash)
        except Exception as ex:
            raise McUpdateByIDException("Update to UPDATE hash '%s': %s" % (str(update_hash), str(ex)))

        updated_row = self.find_by_id(table=table, object_id=object_id)

        return updated_row
Example #27
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack. Stores the content in the download
    store. Uses the story->{ content } field if present or otherwise generates the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
Example #28
def add_story_and_content_download(db: DatabaseHandler, story: dict, parent_download: dict) -> Optional[dict]:
    """If the story is new, add it to the database and also add a pending download for the story content."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    story = add_story(db=db, story=story, feeds_id=parent_download['feeds_id'])

    if story is not None:
        _create_child_download_for_story(db=db, story=story, parent_download=parent_download)

    return story
Example #29
    def __init__(self,
                 email: str,
                 full_name: str = None,
                 notes: str = None,
                 active: bool = None,
                 weekly_requests_limit: int = None,
                 weekly_requested_items_limit: int = None,
                 password: str = None,
                 password_repeat: str = None,
                 role_ids: List[int] = None,
                 subscribe_to_newsletter: bool = None,
                 activation_url: str = None):

        if not full_name:
            raise McAuthUserException("User full name is unset.")

        if notes is None:
            raise McAuthUserException("User notes are undefined (should be at least an empty string).")

        if not isinstance(role_ids, list):
            raise McAuthUserException("List of role IDs is not an array.")

        if not password:
            raise McAuthUserException("Password is unset.")

        if not password_repeat:
            raise McAuthUserException("Password repeat is unset.")

        # Password will be verified by ::NewOrModifyUser

        # Either activate the user right away, or make it inactive and send out an email with activation link
        if (active and activation_url) or (not active and not activation_url):
            raise McAuthUserException("Either make the user active or set the activation URL.")

        super().__init__(
            email=email,
            full_name=full_name,
            notes=notes,
            active=active,
            weekly_requests_limit=weekly_requests_limit,
            weekly_requested_items_limit=weekly_requested_items_limit,
            password=password,
            password_repeat=password_repeat,
            role_ids=role_ids,
        )

        if isinstance(subscribe_to_newsletter, bytes):
            subscribe_to_newsletter = decode_object_from_bytes_if_needed(subscribe_to_newsletter)
        subscribe_to_newsletter = bool(int(subscribe_to_newsletter or 0))

        activation_url = decode_object_from_bytes_if_needed(activation_url)

        self.__subscribe_to_newsletter = subscribe_to_newsletter
        self.__activation_url = activation_url
Example #30
    def __init__(self, role_id: int, role_name: str):
        if isinstance(role_id, bytes):
            role_id = decode_object_from_bytes_if_needed(role_id)
        role_name = decode_object_from_bytes_if_needed(role_name)

        if not role_id:
            raise McAuthUserException("Role ID is unset.")
        if not role_name:
            raise McAuthUserException("Role name is unset.")

        self.__role_id = role_id
        self.__role_name = role_name
Example #31
    def fetch_annotation_for_story(self, db: DatabaseHandler, stories_id: int) -> Union[dict, list, None]:
        """Fetch the annotation from key-value store for the story, or None if story is not annotated."""

        if not self.annotator_is_enabled():
            fatal_error("Annotator is not enabled in the configuration.")

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        stories_id = int(stories_id)

        if not self.story_is_annotated(db=db, stories_id=stories_id):
            log.warning("Story %d is not annotated." % stories_id)
            return None

        json = self.__postgresql_store.fetch_content(db=db, object_id=stories_id)
        if json is None:
            raise McJSONAnnotatorException("Fetched annotation is undefined or empty for story %d." % stories_id)

        json = json.decode('utf-8')

        try:
            annotation = decode_json(json)
            if annotation is None:
                raise McJSONAnnotatorException("Annotation is None after decoding from JSON.")
        except Exception as ex:
            raise McJSONAnnotatorException(
                "Unable to parse annotation JSON for story %d: %s\nString JSON: %s" % (stories_id, str(ex), json,)
            )

        try:
            annotation = self._preprocess_stored_annotation(annotation)
            if annotation is None:
                raise McJSONAnnotatorException("Annotation is None after preprocessing.")
        except Exception as ex:
            fatal_error(
                "Unable to preprocess stored annotation for story %d: %s\nString JSON: %s" %
                (stories_id, str(ex), json,)
            )

        return annotation
Example #32
    def stem_words(self, words: List[str]) -> List[str]:
        """Stem list of words with PyStemmer."""
        language_code = self.language_code()
        words = decode_object_from_bytes_if_needed(words)

        # Normalize apostrophe so that "it’s" and "it's" get treated identically (it's being done in
        # _tokenize_with_spaces() too but let's not assume that all tokens that are to be stemmed go through sentence
        # tokenization first)
        words = [word.replace("’", "'") for word in words]

        if language_code is None:
            raise McLanguageException("Language code is None.")

        if words is None:
            raise McLanguageException("Words to stem is None.")

        # (Re-)initialize stemmer if needed
        if self.__pystemmer is None:

            try:
                self.__pystemmer = PyStemmer(language_code)
            except Exception as ex:
                raise McLanguageException(
                    "Unable to initialize PyStemmer for language '%s': %s" % (
                        language_code,
                        str(ex),
                    ))

        stems = self.__pystemmer.stemWords(words)

        if len(words) != len(stems):
            log.warning(
                "Stem count is not the same as word count; words: %s; stems: %s"
                % (
                    str(words),
                    str(stems),
                ))

        # Perl's Snowball implementation used to return lowercase stems
        stems = [stem.lower() for stem in stems]

        return stems
Example #33
    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Japanese sentence into words.
        
        Removes punctuation and words that don't belong to part-of-speech whitelist."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:

                primary_form_and_pos_number = parsed_token_line.split(
                    self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)

                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)

            else:
                # Ignore all the "EOS" stuff
                pass

        return words
Example #34
def recreate_db(label: str = None) -> None:
    """(Re)create database schema."""
    def reset_all_schemas(db_: DatabaseHandler) -> None:
        """Recreate all schemas."""

        schemas = db_.query(
            """
            SELECT schema_name
            FROM information_schema.schemata
            WHERE schema_name NOT LIKE %(schema_pattern)s
              AND schema_name != 'information_schema'
            ORDER BY schema_name
        """, {
                'schema_pattern': 'pg_%'
            }).flat()

        # When dropping schemas, PostgreSQL spits out a lot of notices which break "no warnings" unit test
        db_.query('SET client_min_messages=WARNING')

        for schema in schemas:
            db_.query('DROP SCHEMA IF EXISTS %s CASCADE' % schema)

        db_.query('SET client_min_messages=NOTICE')

    # ---

    label = decode_object_from_bytes_if_needed(label)

    db = connect_to_db(label=label, do_not_check_schema_version=True)

    log.info("Resetting all schemas...")
    reset_all_schemas(db_=db)

    db.set_show_error_statement(True)

    mediawords_sql_path = mc_sql_schema_path()
    log.info("Importing from %s..." % mediawords_sql_path)
    with open(mediawords_sql_path, 'r') as mediawords_sql_f:
        mediawords_sql = mediawords_sql_f.read()
        db.query(mediawords_sql)

    log.info("Done.")
Example #35
def is_homepage_url(url: str) -> bool:
    """Returns true if URL is homepage (e.g. http://www.wired.com/) and not a child page
    (e.g. http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/)."""
    url = decode_object_from_bytes_if_needed(url)
    if url is None:
        log.debug("URL is None.")
        return False
    if len(url) == 0:
        log.debug("URL is empty.")
        return False

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        log.debug("URL '%s' is invalid." % url)
        return False

    # Remove cruft from the URL first
    try:
        url = normalize_url(url)
    except McNormalizeURLException as ex:
        log.debug(
            "Unable to normalize URL '%s' before checking if it's a homepage: %s"
            % (url, ex))
        return False

    # The shortened URL may lead to a homepage URL, but the shortened URL
    # itself is not a homepage URL
    if is_shortened_url(url):
        return False

    # If we still have something for a query of the URL after the
    # normalization, always assume that the URL is *not* a homepage
    uri = furl(url)
    if len(str(uri.query)) > 0:
        return False

    for homepage_url_path_regex in __HOMEPAGE_URL_PATH_REGEXES:
        if re.search(homepage_url_path_regex, str(uri.path)):
            return True

    return False
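A sketch using the docstring's own URLs (the shortener case assumes is_shortened_url() recognizes bit.ly):

is_homepage_url('http://www.wired.com/')                                                # True
is_homepage_url('http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/')  # False
is_homepage_url('http://bit.ly/2ezNOfL')                                                # False (shortened URL)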
Example #36
def get_url_distinctive_domain(url: str) -> str:
    """Return a truncated form of URL's host (domain) that distinguishes it from others, e.g.:

    * www.whitehouse.gov => whitehouse.gov
    * www.blogspot.com => blogspot.com
    * kardashian.blogspot.com => kardashian.blogspot.com

    Return original URL if unable to process the URL."""

    try:
        url = decode_object_from_bytes_if_needed(url)

        host = get_url_host(url)
        if host is None:
            return url

        name_parts = host.split('.')
        n = len(name_parts) - 1

        if re.search(r'\.(gov|org|com?)\...$', host, re.I):
            # foo.co.uk -> foo.co.uk instead of co.uk
            parts = [
                str(name_parts[n - 2]),
                str(name_parts[n - 1]),
                str(name_parts[n])
            ]
            domain = '.'.join(parts)
        elif re.search(
                r'\.go\.com|\.wordpress\.com|\.blogspot\.|\.livejournal\.com|\.privet\.ru|\.wikia\.com'
                r'|\.feedburner\.com|\.24open\.ru|\.patch\.com|\.tumblr\.com',
                host, re.I):
            # identify sites in these domains as the whole host name (abcnews.go.com instead of go.com)
            domain = host
        else:
            parts = [str(name_parts[n - 1] or ''), str(name_parts[n] or '')]
            domain = '.'.join(parts)

        return domain.lower()

    except Exception as ex:
        log.debug("get_url_distinctive_domain falling back to url: " + str(ex))
        return str(url).lower()
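The docstring's own cases traced through the branches above:

get_url_distinctive_domain('http://www.whitehouse.gov/blog')        # 'whitehouse.gov' (generic fallback branch)
get_url_distinctive_domain('http://kardashian.blogspot.com/posts')  # 'kardashian.blogspot.com' (blog-host branch)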
Example #37
def process_download_for_extractor(
    db: DatabaseHandler,
    download: dict,
    extractor_args: PyExtractorArguments = PyExtractorArguments()
) -> None:
    """Extract the download and create the resulting download_text entry. If there are no remaining downloads to be
    extracted for the story, call process_extracted_story() on the parent story."""

    download = decode_object_from_bytes_if_needed(download)

    stories_id = download['stories_id']

    log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id,
                                         download['url']))

    extract_and_create_download_text(db=db,
                                     download=download,
                                     extractor_args=extractor_args)

    has_remaining_download = db.query(
        """
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND extracted = 'f'
          AND type = 'content'
    """, {
            'stories_id': stories_id
        }).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if has_remaining_download is None:
        has_remaining_download = {}

    if len(has_remaining_download) > 0:
        log.info("Pending more downloads...")

    else:
        story = db.find_by_id(table='stories', object_id=stories_id)
        process_extracted_story(db=db,
                                story=story,
                                extractor_args=extractor_args)
Example #38
def extract_article_from_html(html: str) -> str:
    """Extract article HTML from a full HTML file."""
    # FIXME move HTML stripping here too
    html = decode_object_from_bytes_if_needed(html)
    if html is None or html == '':
        return ''

    try:
        doc = readability.readability.Document(html)

        doc_title = doc.short_title().strip()
        doc_summary = doc.summary().strip()

        extracted_text = "%s\n\n%s" % (doc_title, doc_summary)

    except Exception as ex:
        log.error('Exception raised while extracting HTML: %s' % str(ex))
        extracted_text = ''

    return extracted_text
Example #39
    def fetch_content(self, db: DatabaseHandler, object_id: int, object_path: str = None) -> bytes:
        """Read object from PostgreSQL's 'path' row."""

        object_id = self._prepare_object_id(object_id)

        object_path = decode_object_from_bytes_if_needed(object_path)

        if object_path is None:
            raise McDatabaseInlineStoreException("Object path for object ID %d is None." % object_id)

        if not object_path.startswith(self.__CONTENT_PREFIX):
            raise McDatabaseInlineStoreException(
                "Object path for object ID %d is invalid: %s" % (object_id, object_path,)
            )

        object_path = object_path[len(self.__CONTENT_PREFIX):]

        content = object_path.encode('utf-8')

        return content
Example #40
def decode_json(json_string: str) -> Union[dict, list]:
    """Decode JSON to dictionary or list."""

    json_string = decode_object_from_bytes_if_needed(json_string)

    if json_string is None:
        raise McDecodeJSONException("JSON string is None.")

    if len(json_string) == 0:
        raise McDecodeJSONException("JSON string is empty.")

    try:
        json_obj = json.loads(json_string)
    except Exception as ex:
        raise McDecodeJSONException("Unable to decode string %s from JSON: %s" % (str(json_string), str(ex)))

    if json_obj is None:
        raise McDecodeJSONException("Resulting JSON object is None for string: %s" % (str(json_string),))

    return json_obj
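A quick sketch of both outcomes:

decode_json('{"stories_id": 1, "tags": ["a", "b"]}')  # {'stories_id': 1, 'tags': ['a', 'b']}
decode_json('')                                       # raises McDecodeJSONException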
Example #41
def _delete_story_sentences(db: DatabaseHandler, story: dict) -> None:
    """Delete any existing stories for the given story and also update media_stats to adjust for the deletion."""
    story = decode_object_from_bytes_if_needed(story)

    num_deleted = db.query("""
        DELETE FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {'stories_id': story['stories_id']}).rows()

    if num_deleted > 0:
        db.query("""
            UPDATE media_stats
            SET num_sentences = num_sentences - %(num_deleted)s
            WHERE media_id = %(media_id)s
              AND stat_date = %(publish_date)s::date
        """, {
            'num_deleted': num_deleted,
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        })
Example #42
def fix_common_url_mistakes(url: str) -> Optional[str]:
    """Fixes common URL mistakes (mistypes, etc.)."""
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        return None

    # Fix broken URLs that look like this: http://http://www.al-monitor.com/pulse
    url = re.sub(r'(https?://)https?:?//', r"\1", url, flags=re.I)

    # Fix URLs with only one slash after "http" ("http:/www.")
    url = re.sub(r'(https?:/)(www)', r"\1/\2", url, flags=re.I)

    # replace backslashes with forward
    url = re.sub(r'\\', r'/', url)

    # http://newsmachete.com?page=2 -> http://newsmachete.com/?page=2
    url = re.sub(r'(https?://[^/]+)\?', r"\1/?", url)

    return url
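The substitutions above cover these cases (sketch; URLs are made up):

fix_common_url_mistakes('http://http://www.al-monitor.com/pulse')  # 'http://www.al-monitor.com/pulse'
fix_common_url_mistakes('http:/www.example.com/')                  # 'http://www.example.com/'
fix_common_url_mistakes('http://newsmachete.com?page=2')           # 'http://newsmachete.com/?page=2'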
Example #43
def create_test_topic(db: DatabaseHandler, label: str) -> dict:
    """Create test topic with a simple label."""

    label = decode_object_from_bytes_if_needed(label)

    return db.create(
        table='topics',
        insert_hash={
            'name': label,
            'description': label,
            'pattern': label,
            'solr_seed_query': label,
            'solr_seed_query_run': True,
            'start_date': '2016-01-01',
            'end_date': '2016-03-01',
            'job_queue': 'mc',
            'max_stories': 100000,
            'platform': 'web'
        }
    )
Example #44
def run_topics_fetch_link(topic_fetch_urls_id: int,
                          domain_timeout: Optional[int] = None) -> None:
    """Fetch a link for a topic and either match it to an existing story or generate a story from it.

    Almost all of the interesting functionality here happens in fetch_topic_url(). The code here just deals with
    routing, including requeueing responses throttled by mediawords.util.web.user_agent.throttled."""
    global _consecutive_requeues

    if isinstance(topic_fetch_urls_id, bytes):
        topic_fetch_urls_id = decode_object_from_bytes_if_needed(
            topic_fetch_urls_id)

    if topic_fetch_urls_id is None:
        raise McFetchLinkJobException("'topic_fetch_urls_id' is None.")

    topic_fetch_urls_id = int(topic_fetch_urls_id)

    log.info("Start fetch for topic_fetch_url %d" % topic_fetch_urls_id)

    db = connect_to_db()

    try:
        if not fetch_topic_url_update_state(
                db=db,
                topic_fetch_urls_id=topic_fetch_urls_id,
                domain_timeout=domain_timeout):
            JobBroker(queue_name=QUEUE_NAME).add_to_queue(
                topic_fetch_urls_id=topic_fetch_urls_id)

            _consecutive_requeues += 1
            if _consecutive_requeues > REQUEUES_UNTIL_SLEEP:
                log.info("sleeping after %d consecutive retries ..." %
                         _consecutive_requeues)
                time.sleep(1)

    except Exception as ex:
        # Error has already been logged by fetch_topic_url_update_state(), so we only need to work out the
        # "consecutive retries" here
        log.error(f"Fetching URL for ID {topic_fetch_urls_id} failed: {ex}")
        _consecutive_requeues = 0

    log.info("Finished fetch for topic_fetch_url %d" % topic_fetch_urls_id)
Example #45
def update_extractor_version_tag(db: DatabaseHandler, story: dict) -> None:
    """Add extractor version tag to the story."""
    # FIXME no caching because unit tests run in the same process so a cached tag set / tag will not be recreated.
    # Purging such a cache manually is very error-prone.

    story = decode_object_from_bytes_if_needed(story)

    tag_set = db.find_or_create(
        table='tag_sets',
        insert_hash={'name': extractor_version_tag_sets_name()})

    db.query(
        """
        DELETE FROM stories_tags_map AS stm
            USING tags AS t
                JOIN tag_sets AS ts
                    ON ts.tag_sets_id = t.tag_sets_id
        WHERE t.tags_id = stm.tags_id
          AND ts.tag_sets_id = %(tag_sets_id)s
          AND stm.stories_id = %(stories_id)s
    """, {
            'tag_sets_id': tag_set['tag_sets_id'],
            'stories_id': story['stories_id'],
        })

    extractor_version = extractor_name()
    tag = db.find_or_create(table='tags',
                            insert_hash={
                                'tag': extractor_version,
                                'tag_sets_id': tag_set['tag_sets_id']
                            })
    tags_id = tag['tags_id']

    db.query(
        """
        INSERT INTO stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
    """, {
            'stories_id': story['stories_id'],
            'tags_id': tags_id
        })
Example #46
def run_word2vec_generate_snapshot_model(snapshots_id: int) -> None:
    """Generate word2vec model for a given snapshot."""

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(snapshots_id, bytes):
        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

    if snapshots_id is None:
        raise McWord2vecGenerateSnapshotModelException(
            "'snapshots_id' is None.")

    snapshots_id = int(snapshots_id)

    db = connect_to_db()

    # FIXME might be more efficient to pass topics_id as a parameter
    topics_id = db.query(
        """
        SELECT topics_id
        FROM snapshots
        WHERE snapshots_id = %(snapshots_id)s
    """, {
            'snapshots_id': snapshots_id
        }).flat()[0]

    log.info(
        f"Generating word2vec model for topic {topics_id}, snapshot {snapshots_id}..."
    )

    sentence_iterator = SnapshotSentenceIterator(db=db,
                                                 topics_id=topics_id,
                                                 snapshots_id=snapshots_id)
    model_store = SnapshotDatabaseModelStore(db=db,
                                             topics_id=topics_id,
                                             snapshots_id=snapshots_id)
    train_word2vec_model(sentence_iterator=sentence_iterator,
                         model_store=model_store)

    log.info(
        f"Finished generating word2vec model for topic {topics_id}, snapshot {snapshots_id}."
    )
Example #47
def role_id_for_role(db: DatabaseHandler, role: str) -> int:
    """Fetch a user role's ID for a role; raise if no such role was found."""
    role = decode_object_from_bytes_if_needed(role)

    if not role:
        raise McRoleIDForRoleException("Role is empty.")

    auth_roles_id = db.query(
        """
        SELECT auth_roles_id
        FROM auth_roles
        WHERE role = %(role)s
        LIMIT 1
    """, {
            'role': role
        }).flat()

    if not auth_roles_id:
        raise McRoleIDForRoleException("Role '%s' was not found." % role)

    return int(auth_roles_id[0])
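
Usage sketch; the role name 'admin' is an assumption about what the auth_roles table contains:

db = connect_to_db()
try:
    admin_role_id = role_id_for_role(db=db, role='admin')
except McRoleIDForRoleException as ex:
    log.error(f"Role lookup failed: {ex}")
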
Example #48
def get_url_host(url: str) -> str:
    """Return hostname of an URL. If we can't parse out the host name, just return the URL."""
    url = decode_object_from_bytes_if_needed(url)
    if url is None:
        raise McGetURLHostException("URL is None")
    if len(url) == 0:
        raise McGetURLHostException("URL is empty")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        return url

    uri = furl(url)

    host = uri.host

    if host:
        return host
    else:
        return url
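
Expected behavior, sketched as assertions; the host values are what furl yields for these URLs, assuming fix_common_url_mistakes() leaves well-formed URLs alone (non-HTTP(S) inputs fall through and come back as-is):

assert get_url_host('http://www.example.com/page.html') == 'www.example.com'
assert get_url_host('https://example.com:8080/x?y=1') == 'example.com'
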
def delete_user(db: DatabaseHandler, email: str) -> None:
    """Delete user."""

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user_info(db=db, email=email)
    except Exception as _:
        raise McAuthProfileException(
            "User with email address '%s' does not exist." % email)

    # Delete the user (the foreign key on 'auth_users_roles_map' will cascade the delete)
    db.query(
        """
        DELETE FROM auth_users
        WHERE email = %(email)s
    """, {'email': email})
Example #50
    def update_job_state_message(self, db: DatabaseHandler,
                                 message: str) -> None:
        """
        Update the message field for the current "job_states" row.

        This is a public method intended to be used by code anywhere up the call stack from run() to publish
        messages reporting the progress of a long-running job.
        """
        message = decode_object_from_bytes_if_needed(message)

        # Verify that the "job_states" row still exists before updating it
        db.require_by_id(table='job_states', object_id=self.__job_states_id)

        job_state = db.update_by_id(table='job_states',
                                    object_id=self.__job_states_id,
                                    update_hash={
                                        'message': message,
                                        'last_updated': sql_now(),
                                    })

        self.__update_table_state(db=db, job_state=job_state)
Example #51
    def story_is_annotatable(self, db: DatabaseHandler, stories_id: int) -> bool:
        """Check if story can be annotated."""

        if not self.annotator_is_enabled():
            raise McJSONAnnotatorException("Annotator is not enabled in the configuration.")

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        stories_id = int(stories_id)

        story = db.query("""
            SELECT story_is_english_and_has_sentences
            FROM story_is_english_and_has_sentences(%(stories_id)s)
        """, {'stories_id': stories_id}).hash()

        return story is not None and int(story['story_is_english_and_has_sentences']) == 1
Example #52
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    """Create and return a pending 'downloads' row for the given feed."""
    feed = decode_object_from_bytes_if_needed(feed)

    priority = 0
    if 'last_attempted_download_time' not in feed:
        priority = 10

    host = get_url_host(url=feed['url'])

    return db.create(table='downloads',
                     insert_hash={
                         'feeds_id': int(feed['feeds_id']),
                         'url': feed['url'],
                         'host': host,
                         'type': 'feed',
                         'sequence': 1,
                         'state': 'pending',
                         'priority': priority,
                         'download_time': 'NOW()',
                         'extracted': False,
                     })
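
The only branching is the priority rule: feeds with no recorded download attempt get priority 10 instead of the default 0. Restated as a tiny standalone helper (the helper name is hypothetical):

def _feed_download_priority(feed: dict) -> int:
    """Never-attempted feeds get priority 10; previously attempted ones get 0."""
    return 10 if 'last_attempted_download_time' not in feed else 0


assert _feed_download_priority({'url': 'http://example.com/feed'}) == 10
assert _feed_download_priority({'url': 'http://example.com/feed',
                                'last_attempted_download_time': '2020-01-01'}) == 0
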
Example #53
def _add_story_tags_to_stories(db: DatabaseHandler, stories: List[Dict[str, Any]]) -> None:
    """Add story tags to stories for Solr indexing."""
    stories = decode_object_from_bytes_if_needed(stories)

    tags = []
    num_tags = 5

    for i in range(1, num_tags + 1):
        tags.append(lookup_or_create_tag(db=db, tag_name=f"test:test_{i}"))

    for story in stories:
        assert isinstance(story, dict)

        # Rotate the tag list so that consecutive stories receive different tags
        tag = tags.pop()
        tags.insert(0, tag)
        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
        """, {
            'stories_id': story['stories_id'],
            'tags_id': tag['tags_id'],
        })
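
The pop()/insert(0, ...) pair is a right rotation, so consecutive stories receive tags in a repeating cycle, last tag first. itertools.cycle expresses the same assignment pattern; a sketch with plain strings:

from itertools import cycle

tags = ['test_1', 'test_2', 'test_3']
stories = ['story_a', 'story_b', 'story_c', 'story_d']

assignments = dict(zip(stories, cycle(reversed(tags))))
assert assignments == {
    'story_a': 'test_3',
    'story_b': 'test_2',
    'story_c': 'test_1',
    'story_d': 'test_3',  # the cycle wraps around
}
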
Example #54
def _get_first_download(db: DatabaseHandler, story: dict) -> dict:
    """Get the first download linking to this story."""

    story = decode_object_from_bytes_if_needed(story)

    first_download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        ORDER BY sequence ASC
        LIMIT 1
    """, {
            'stories_id': story['stories_id']
        }).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if first_download is None:
        first_download = {}

    return first_download
Example #55
def run_command_in_foreground(command: List[str]) -> None:
    """Run command in foreground, raise McRunCommandInForegroundException if it fails."""

    # Decode before logging, otherwise join() might fail on bytes
    command = decode_object_from_bytes_if_needed(command)

    l.debug("Running command: %s" % ' '.join(command))

    # Add some more PATHs to look into
    env_path = os.environ.copy()
    env_path['PATH'] = '/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sbin:/bin:/sbin:' + env_path['PATH']

    # noinspection PyBroadException
    try:
        if sys.platform.lower() == 'darwin':
            # OS X -- requires some crazy STDOUT / STDERR buffering
            line_buffered = 1
            process = subprocess.Popen(command,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       bufsize=line_buffered,
                                       env=env_path)
            while True:
                output = process.stdout.readline()
                if len(output) == 0 and process.poll() is not None:
                    break
                l.info(output.strip())
            rc = process.poll()
            if rc != 0:  # a negative code means the process was killed by a signal
                raise McRunCommandInForegroundException(
                    "Process returned non-zero exit code %d" % rc)
        else:
            # assume Ubuntu
            subprocess.check_call(command, env=env_path)
    except subprocess.CalledProcessError as ex:
        raise McRunCommandInForegroundException(
            "Process returned non-zero exit code %d" % ex.returncode)
    except Exception as ex:
        raise McRunCommandInForegroundException(
            "Error while running command: %s" % str(ex))
Example #56
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
            from stories s, topic_stories ts, media m
            where
                s.stories_id = ts.stories_id and
                s.media_id = m.media_id and
                m.foreign_rss_links = true and
                ts.topics_id = %(a)s and
                not ts.valid_foreign_rss_story
        """,
        {'a': topic['topics_id']}).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {'a': story['stories_id']}).hash()

        content = ''
        try:
            content = mediawords.dbi.downloads.fetch_content(db, download)
        except Exception:
            pass

        db.begin()
        db.create('topic_seed_urls', {
            'url': story['url'],
            'topics_id': topic['topics_id'],
            'source': 'merge_foreign_rss_stories',
            'content': content
        })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {'a': story['stories_id'], 'b': topic['topics_id']})
        db.commit()
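
Usage sketch, reusing helpers from the other examples; the topic ID is illustrative:

db = connect_to_db()
topic = db.require_by_id(table='topics', object_id=1)
merge_foreign_rss_stories(db=db, topic=topic)
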
Example #57
def language_code_for_text(text: str) -> str:
    """Return an ISO 639-1 language code for the plain text passed as a parameter.

    :param text: Text that should be identified
    :return: ISO 639-1 language code (e.g. 'en') on successful identification, empty string ('') on failure
    """
    text = decode_object_from_bytes_if_needed(text)

    if not text:
        return ''

    if len(text) > __MAX_TEXT_LENGTH:
        log.warning("Text is longer than %d, trimming..." % __MAX_TEXT_LENGTH)
        text = text[:__MAX_TEXT_LENGTH]

    # We need to verify that the text can cleanly encode and decode because CLD can segfault on bad UTF-8
    text = __recode_utf8_string(text)

    try:
        is_reliable, text_bytes_found, details = cld2.detect(
            utf8Bytes=text, useFullLangTables=True)
    except Exception as ex:
        log.error("Error while detecting language: %s" % str(ex))
        return ''

    if not details:
        return ''

    best_match = details[0]
    language_name = best_match.language_name.lower()
    language_code = best_match.language_code.lower()

    if language_name in {'unknown', 'tg_unknown_language'} or language_code == 'un':
        return ''

    if not language_is_supported(language_code):
        return ''

    return language_code
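
Expected behavior sketched as assertions; detection itself depends on pycld2 being installed, and the English sentence is illustrative but long enough for reliable detection:

assert language_code_for_text('') == ''  # empty input short-circuits
assert language_code_for_text(
    'The quick brown fox jumps over the lazy dog, and then it jumps again.'
) == 'en'
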
Example #58
    def __execute(self, cursor: DictCursor, query: str, page: int, rows_per_page: int,
                  double_percentage_sign_marker: str) -> None:

        query = decode_object_from_bytes_if_needed(query)

        if page < 1:
            raise McQueryPagedHashesException('Page must be 1 or bigger.')

        offset = (page - 1) * rows_per_page

        query = "%(original_query)s LIMIT ( %(rows_per_page)d + 1 ) OFFSET %(offset)s" % {
            'original_query': query,
            'rows_per_page': rows_per_page,
            'offset': offset,
        }

        query_args = [query]
        query_args = convert_dbd_pg_arguments_to_psycopg2_format(*query_args)

        # Run the paged query
        rs = DatabaseResult(cursor=cursor,
                            query_args=query_args,
                            double_percentage_sign_marker=double_percentage_sign_marker)

        hashes = rs.hashes()

        # Drop the extra row that was fetched only to detect whether another page exists
        one_more_page = False
        if len(hashes) > rows_per_page:
            one_more_page = True
            del hashes[rows_per_page:]

        hashes_size = offset + len(hashes)
        if one_more_page:
            hashes_size += 1

        pager = Pages(total_entries=hashes_size, entries_per_page=rows_per_page, current_page=page)

        self.__list = hashes
        self.__pager = pager
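
The pagination trick is to fetch rows_per_page + 1 rows: the presence of the extra row proves there is at least one more page. The same idea over a plain list (the helper name is hypothetical):

from typing import List, Tuple


def paged_slice(items: List, page: int, rows_per_page: int) -> Tuple[List, bool]:
    """Return (rows for this page, whether at least one more page exists)."""
    if page < 1:
        raise ValueError('Page must be 1 or bigger.')
    offset = (page - 1) * rows_per_page
    window = items[offset:offset + rows_per_page + 1]  # fetch one extra row
    one_more_page = len(window) > rows_per_page
    return window[:rows_per_page], one_more_page


rows, has_more = paged_slice(list(range(10)), page=2, rows_per_page=4)
assert rows == [4, 5, 6, 7] and has_more is True
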
Example #59
def wait_for_tcp_port_to_open(port: int,
                              hostname: str = 'localhost',
                              retries: int = 60,
                              delay: Union[int, float] = 1) -> bool:
    """Try connecting to TCP port until it opens (or not); return True if managed to connect."""

    hostname = decode_object_from_bytes_if_needed(hostname)

    port_is_open = False
    for retry in range(retries):
        if retry == 0:
            log.debug("Trying to connect to %s:%d" % (hostname, port))
        else:
            log.debug("Trying to connect to %s:%d, retry %d" %
                      (hostname, port, retry))

        if tcp_port_is_open(port, hostname):
            port_is_open = True
            break
        else:
            time.sleep(delay)
    return port_is_open
Example #60
def tcp_port_is_open(port: int, hostname: str = 'localhost') -> bool:
    """Test if TCP port is open."""

    hostname = decode_object_from_bytes_if_needed(hostname)

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(2)
    try:
        result = sock.connect_ex((hostname, port))
    except socket.gaierror as ex:
        log.warning(f"Unable to resolve {hostname}: {ex}")
        sock.close()  # don't leak the socket on the error path
        return False

    if result == 0:
        try:
            sock.shutdown(socket.SHUT_RDWR)
        except OSError as ex:
            # Quiet down "OSError: [Errno 57] Socket is not connected"
            log.warning("Error while shutting down socket: %s" % str(ex))

    sock.close()
    return result == 0
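
Together the two helpers form a simple readiness probe; a hedged sketch where the port and retry budget are illustrative:

# Wait up to ~30 seconds for a local PostgreSQL to accept connections:
if wait_for_tcp_port_to_open(port=5432, hostname='localhost', retries=30, delay=1):
    log.info("Port is open")
else:
    log.warning("Port never opened")
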