Code Example #1
File: ingest.py Project: abelsonlive/newslynx-core
    def _content_items():
        content_query = """
        SELECT '{}' AS source_id, id FROM content
        WHERE (url in ({}) or id in ({}))
        AND org_id = {}
        """
        queries = []
        for source_id, vals in meta.iteritems():
            links = ",".join(
                ["'%s'" % l for l in uniq(meta[source_id].pop('links', []))])
            ids = ",".join(
                [str(i) for i in uniq(meta[source_id].pop('content_item_ids', []))])
            if len(links) or len(ids):

                # THIS IS KIND OF A HACK FOR NOW.
                if not links:
                    links = "'__null___'"
                if not ids:
                    ids = '-99999'
                queries.append(
                    content_query.format(source_id, links, ids, org_id))

        # execute query + modify meta.
        if len(queries):
            q = " UNION ALL ".join(queries)
            for row in ResultIter(db.session.execute(q)):
                src_id = row['source_id']
                k = 'content_item_ids'
                if k not in meta[src_id]:
                    meta[src_id][k] = []
                meta[src_id][k].append(row['id'])
        db.session.commit()
        db.session.close()
        db.session.remove()
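Every snippet on this page funnels its result through a uniq helper that is never shown here. A minimal, order-preserving sketch of what it plausibly does (the real newslynx utility may differ in details):

    def uniq(seq):
        # order-preserving de-duplication: keep the first occurrence
        # of each item and drop later repeats.
        seen = set()
        out = []
        for item in seq:
            if item not in seen:
                seen.add(item)
                out.append(item)
        return out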
Code Example #2
 def queries(self):
     """
     Programmatically generate search queries based on an org's domains
     """
     domains = self.org.get('domains', [])
     domains.extend(self.settings.get('short_urls', []))
     domains.extend(self.settings.get('short_domains', []))
     domains = uniq(domains)
     _queries = []
     for d in domains:
         term = d.replace(".", " ").strip().lower()
         q = '"{}" filter:links'.format(term)
         _queries.append(q)
     if not len(_queries):
         raise RequestError('This Org has no domains.')
     return uniq(_queries)
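For illustration, an org with the single domain example.com would yield one Twitter-style search query (a hypothetical walk-through of the code above):

    term = 'example.com'.replace(".", " ").strip().lower()  # 'example com'
    q = '"{}" filter:links'.format(term)  # '"example com" filter:links'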
Code Example #3
def from_html(htmlstring, source=None):
    """
    Extract all img urls from an html string
    """
    if not htmlstring:
        return []
    soup = BeautifulSoup(htmlstring)
    out_imgs = []

    for tag, attr in IMG_TAGS:

        for el in soup.find_all(tag):

            img_url = el.attrs.get(attr)
            if not img_url:
                continue

            # only take images with known formats
            fmt = url.is_image(img_url)
            if not fmt:
                continue

            # absolutify images if we know their source.
            if img_url.startswith('/') or not img_url.startswith('http'):
                if source:
                    img_url = urljoin(source, img_url)
                else:
                    continue

            out_imgs.append(img_url)
    return uniq(out_imgs)
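A quick usage sketch (hypothetical HTML; assumes url.is_image recognizes the .png extension):

    html = '<img src="/static/chart.png"><img src="spacer">'
    from_html(html, source='http://example.com/story')
    # -> ['http://example.com/static/chart.png']
    # the second tag is dropped because it has no recognizable image
    # format; the first is absolutified because a source was supplied.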
Code Example #4
 def meta(self):
     d = {
         'followers': self._user.get('followers_count'),
         'friends': self._user.get('friends_count'),
         'hashtags': uniq([h['text'] for h in self._entities.get('hashtags', [])])
     }
     if self.incl_embed:
         d['embed'] = self.embed
     return d
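For context, self._user and self._entities presumably mirror the Twitter API's user and entities payloads; a hypothetical input and result:

    # _user = {'followers_count': 120, 'friends_count': 80}
    # _entities = {'hashtags': [{'text': 'news'}, {'text': 'news'}]}
    # meta() -> {'followers': 120, 'friends': 80, 'hashtags': ['news']}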
Code Example #5
File: rss.py Project: eads/newslynx-core
 def get_authors(self, entry):
     """
     Parse all author candidates and return the unique ones.
     """
     authors = []
     candidates = self.get_candidates(entry, AUTHOR_CANDIDATE_JSONPATH)
     for c in candidates:
         for a in author.parse(c):
             authors.append(a)
     return uniq(authors)
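author.parse is not shown; it presumably splits a raw byline string into individual names, along these lines:

    # hypothetical behavior:
    # author.parse('By Jane Doe and John Smith')
    # -> ['Jane Doe', 'John Smith']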
Code Example #6
File: rss.py Project: eads/newslynx-core
    def get_candidates(self, obj, jsonpaths):
        """
        Evaluate an object against a list of jsonpaths and
        collect all unique values (flattening lists of values).
        """
        candidates = []
        for path in jsonpaths:
            path_candidates = self.get_jsonpath(obj, path)

            if isinstance(path_candidates, list):
                for candidate in path_candidates:
                    if candidate:
                        candidates.append(candidate)

            elif isinstance(path_candidates, basestring):
                # a bare string is itself a single candidate.
                candidates.append(path_candidates)

        return uniq(candidates)
Code Example #7
    def links(self):
        """
        Extract all links
        """
        urls = []
        if self.post.get('link'):
            urls.append(self.post['link'])

        if self.post.get('source'):
            urls.append(self.post['source'])

        if self.post.get('message'):
            msg_urls = url.from_string(self.post['message'])
            urls.extend(msg_urls)

        if self.post.get('description'):
            desc_urls = url.from_string(self.post['description'])
            urls.extend(desc_urls)

        return uniq(urls)
Code Example #8
File: ingest.py Project: abelsonlive/newslynx-core
    def _tags():
        tag_query = """
        SELECT '{0}' AS uniqkey, id FROM tags
        WHERE (slug in ({1}) or id in ({2}))
        AND org_id = {3} AND type='subject'
        """
        queries = []
        for uniqkey, vals in meta.iteritems():

            # separate slugs and ids.
            tags = uniq(meta[uniqkey].pop('tag_ids', []))
            ids = []
            slugs = []
            for t in tags:
                try:
                    ids.append(int(t))
                except ValueError:
                    slugs.append(t)

            # format queries.
            slugs = ",".join(["'%s'" % s for s in slugs])
            ids = ",".join([str(i) for i in ids])
            if len(slugs) or len(ids):
                if not slugs:
                    slugs = "'__null___'"
                if not ids:
                    ids = '-99999'
                queries.append(tag_query.format(uniqkey, slugs, ids, org_id))

        # execute query + modify meta.
        if len(queries):
            q = "\nUNION ALL\n".join(queries)
            for row in ResultIter(db.session.execute(q)):
                id = row['uniqkey']
                k = 'tag_ids'
                if k not in meta[id]:
                    meta[id][k] = []
                meta[id][k].append(row['id'])
        db.session.commit()
        db.session.close()
        db.session.remove()
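For illustration, a source with uniqkey 'src-1', one slug and one numeric id (org_id assumed to be 1) contributes a SELECT shaped like this, and the batch is glued together with UNION ALL:

    # tag_query.format('src-1', "'politics'", '12', 1) produces:
    #
    #   SELECT 'src-1' AS uniqkey, id FROM tags
    #   WHERE (slug in ('politics') or id in (12))
    #   AND org_id = 1 AND type='subject'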
Code Example #9
File: url.py Project: jjelosua/newslynx-core
def from_string(string, **kw):
    """
    get urls from input string
    """

    source = kw.get('source', None)
    exclude_images = kw.get('excl_img', True)

    if not string:
        return []

    raw_urls = re_url.findall(string)
    short_urls = [g[0].strip() for g in re_short_url_text.findall(string)]

    urls = []
    if source:
        for url in raw_urls:
            if not is_abs(url):
                url = urljoin(source, url)
            urls.append(url)
    else:
        urls = [u for u in raw_urls if is_valid(u)]

    # make sure the short-url regex doesn't create partial dupes;
    # build a filtered list rather than removing items mid-iteration.
    short_urls = [u for u in short_urls if not any(u in r for r in urls)]

    # combine
    urls += short_urls

    # remove images.
    if exclude_images:
        urls = [u for u in urls if not is_image(u)]

    # remove invalid urls
    urls = [u for u in urls if is_valid(u)]

    return uniq(urls)
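A hypothetical call (assuming re_url matches absolute http(s) urls and re_short_url_text catches bare shortener hosts like bit.ly):

    text = "See http://example.com/story and bit.ly/abc123 for details."
    from_string(text)
    # roughly -> ['http://example.com/story', 'bit.ly/abc123']
    # the exact result depends on re_url, re_short_url_text and is_valid.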
Code Example #10
File: url.py Project: jjelosua/newslynx-core
def from_html(htmlstring, **kw):
    """
    Extract urls from htmlstring, optionally reconciling
    relative urls + embeds + redirects.
    """
    source = kw.get('source', None)
    exclude_images = kw.get('excl_img', True)

    if not htmlstring:
        return []
    final_urls = []
    if source:
        source_domain = get_domain(source)
    soup = BeautifulSoup(htmlstring)
    for tag in URL_TAGS:

        for el in soup.find_all(tag):

            for attr in URL_ATTRS:
                href = el.attrs.get(attr, None)

                if not href:
                    continue
                url = reconcile_embed(href)

                if source:
                    url = redirect_back(url, source_domain)
                    if not is_abs(url):
                        url = urljoin(source, url)

                if not is_valid(url):
                    continue
                if exclude_images:
                    if not is_image(url):
                        final_urls.append(url)
                else:
                    final_urls.append(url)
    return uniq(final_urls)
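And a usage sketch (hypothetical markup; assumes URL_TAGS includes 'a' and URL_ATTRS includes 'href'):

    html = '<a href="/about">About</a> <a href="http://example.com/x">x</a>'
    from_html(html, source='http://example.com/')
    # -> ['http://example.com/about', 'http://example.com/x']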
Code Example #11
File: ingest.py Project: abelsonlive/newslynx-core
    def _authors():
        author_query = """
        SELECT '{0}' AS uniqkey, id FROM authors
        WHERE (name in ({1}) or id in ({2}))
        AND org_id = {3}
        """
        queries = []
        for uniqkey, vals in meta.iteritems():

            # separate slugs and ids.
            authors = meta[uniqkey].get('author_ids', [])
            ids = []
            names = []
            for a in authors:
                try:
                    ids.append(int(a))
                except ValueError:
                    names.append(a.upper().strip())

            names = ",".join(["'%s'" % n for n in uniq(names)])
            ids = ",".join([str(i) for i in uniq(ids)])
            if names or ids:
                if not names:
                    names = "'__null___'"
                if not ids:
                    ids = '-99999'
                queries.append(
                    author_query.format(uniqkey, names, ids, org_id))

        # execute query + modify meta.
        if len(queries):
            q = "\nUNION ALL\n".join(queries)
            for row in ResultIter(db.session.execute(q)):
                id = row['uniqkey']
                k = 'author_ids'
                # replace raw name/id candidates with resolved DB ids;
                # reset the list only on the first matching row.
                if not meta[id].get('authors_exist'):
                    meta[id][k] = []
                meta[id][k].append(row['id'])
                meta[id]['authors_exist'] = True

        # check for authors we should create.
        to_create = []
        for uniqkey, item in meta.iteritems():
            if item.get('authors_exist', False):
                continue
            for a in meta[uniqkey].pop('author_ids', []):
                if not isinstance(a, (basestring, str, unicode)):
                    continue
                to_create.append((uniqkey, org_id, a))

        # if we should create them, do so.
        if len(to_create):
            # create authors + keep track of content relations
            authors_to_ids = dict()
            seen = set()
            for uniqkey, oid, name in to_create:
                name = name.upper().strip()
                if name not in seen and name.lower().strip() not in author.BAD_TOKENS:
                    authors_to_ids[name] = {}
                    seen.add(name.upper().strip())
                    a = Author(org_id=oid, name=name)
                    db.session.add(a)
                    authors_to_ids[name]['obj'] = a

                # keep track of ALL ids associated with this author.
                if name in authors_to_ids:
                    if 'ids' not in authors_to_ids[name]:
                        authors_to_ids[name]['ids'] = []
                    authors_to_ids[name]['ids'].append(uniqkey)

            # create new authors so we
            # can access their IDs.
            db.session.commit()

            # set author ids back on content item meta
            for name, values in authors_to_ids.iteritems():
                ids = values.get('ids', [])
                obj = values.get('obj')
                k = 'author_ids'
                for uniqkey in ids:
                    if k not in meta[uniqkey]:
                        meta[uniqkey][k] = []
                    meta[uniqkey][k].append(obj.id)
        db.session.close()
        db.session.remove()
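To trace the creation branch with hypothetical data:

    # to_create = [('k1', 1, 'jane doe'), ('k2', 1, 'Jane Doe')]
    # both names normalize to 'JANE DOE', so a single Author row is
    # created, and its new id lands in meta['k1']['author_ids'] and
    # meta['k2']['author_ids'].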
Code Example #12
File: search.py Project: lexifdev/newslynx-core
def tokenizer(text, n):
    """
    Tokenize unique ngrams.
    """
    grams = ngrams(text, n)
    return uniq([" ".join(gram).decode('utf-8') for gram in grams])
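The ngrams helper is defined elsewhere; a minimal word-level sketch of what it plausibly does (the real tokenization may differ):

    def ngrams(text, n):
        # hypothetical: split on whitespace, slide a window of n words.
        words = text.split()
        return zip(*[words[i:] for i in range(n)])

    # tokenizer("the quick brown fox", 2)
    # -> [u'the quick', u'quick brown', u'brown fox']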
Code Example #13
def required_metrics(f):
    """
    What metrics does this formula require?
    """
    return uniq(re_formula_metric_names.findall(f))
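re_formula_metric_names is defined elsewhere; a plausible stand-in, assuming formulas reference metrics as {braced} placeholders (purely illustrative):

    import re

    # hypothetical pattern: pull metric names out of {braced} references.
    re_formula_metric_names = re.compile(r'\{([a-z_]+)\}')

    required_metrics("{pageviews} / {twitter_shares}")
    # -> ['pageviews', 'twitter_shares']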
Code Example #14
File: rss.py Project: eads/newslynx-core
 def get_tags(self, entry):
     """
     Get all tags.
     """
     tags = self.get_candidates(entry, TAG_CANDIDATE_JSONPATH)
     return uniq([t.upper() for t in tags if t and t.strip() != ""])
Code Example #15
 def links(self):
     return uniq([u['expanded_url'] for u in self._entities.get('urls', [])])
Code Example #16
 def img_url(self):
     media = uniq([h['media_url'] for h in self._entities.get('media', [])])
     if len(media):
         return media[0]
     return self._user.get('profile_image_url', None)