Example #1
def get_not_processed_files(options):
    files = []

    filter = ~Q(file_url=Range(ANY, ANY)) | Q(status=FILE_STATUS_NOT_PROCESSED)

    if options.get("include_companies", None):
        filter = Q(ccvm=" ".join(
            options.get("include_companies", []))) & (filter)

    _logger.debug("Loading from database the files to be crawled...")
    paginator = CaravaggioSearchPaginator(
        query_string=str(filter),
        limit=1000, max_limit=1000).\
        models(BovespaCompanyFile).\
        select("ccvm", "doc_type", "fiscal_date", "version")

    while paginator.has_next():
        _logger.debug(
            "{0}/{1} files loaded from database...".
            format(paginator.get_loaded_docs(), paginator.get_hits()))
        paginator.next()
        files.extend([(d.ccvm, d.doc_type, d.fiscal_date, d.version)
                      for d in paginator.get_results()])

    # return [file for file in
    #        CaravaggioSearchQuerySet().models(BovespaCompanyFile).
    #            raw_search(str(filter)).
    #            values_list("ccvm", "doc_type", "fiscal_date", "version")]
    return files
Example #2
def load_accounts(valid_account_types):
    key = "command:gen_accountability_plan:load_accounts"

    # Get all the company accounts for the given
    # company and fiscal_date
    filter = Q(value=Range(0, "*", safe=True, boundaries="exclusive"))
    balance_type_filter = None
    for balance_type in valid_account_types:
        if balance_type_filter is None:
            balance_type_filter = Q(balance_type=balance_type)
        else:
            balance_type_filter |= Q(balance_type=balance_type)

    filter = filter & (balance_type_filter)

    paginator = CaravaggioSearchPaginator(
        str(filter),
        limit=5000,
        **{"group": "true", "group.field": "number_exact", "group.limit": 1},
        useFieldCache=True,
    ).models(BovespaAccount)

    accounts = {}
    while paginator.has_next():
        _logger.debug("{} accounts loaded from database...".format(paginator.get_loaded_docs()))
        paginator.next()
        for acc_number, details in paginator.get_results().items():
            accounts[acc_number] = (details[0].balance_type, details[0].name)

    return accounts
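The explicit loop above that ORs together one Q per balance type can be collapsed with functools.reduce. This is only an illustrative sketch of the filter construction, with made-up balance types, assuming the same solrq imports used by load_accounts:

from functools import reduce
from operator import or_

from solrq import Q, Range

valid_account_types = ["1", "2", "3"]  # hypothetical balance types

# values strictly greater than zero, exactly as in load_accounts above
value_filter = Q(value=Range(0, "*", safe=True, boundaries="exclusive"))

# OR together one Q(balance_type=...) per allowed type
type_filter = reduce(or_, (Q(balance_type=t) for t in valid_account_types))

query_string = str(value_filter & type_filter)
print(query_string)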
Example #3
def get_not_processed_files(options, producer):
    filter = ~Q(file_url=Range(ANY, ANY)) | Q(status=FILE_STATUS_NOT_PROCESSED)

    if options.get("include_companies", None):
        filter = Q(ccvm=Value("({})".format(" ".join(
            options.get("include_companies", []))),
                              safe=True)) & (filter)

    _logger.debug("Loading from database the files to be crawled...")
    paginator = (CaravaggioSearchPaginator(
        query_string=str(filter), limit=1000,
        max_limit=1000).models(BovespaCompanyFile).select(
            "ccvm", "doc_type", "fiscal_date", "version"))

    while paginator.has_next():
        _logger.debug("{0}/{1} files loaded from database...".format(
            paginator.get_loaded_docs(), paginator.get_hits()))
        paginator.next()
        for d in paginator.get_results():
            params = {
                "ccvm": d.ccvm,
                "doc_type": d.doc_type,
                "fiscal_date": d.fiscal_date,
                "version": d.version
            }
            producer.add_crawl_params(params, options)
Example #4
    def where(self, q, **kwargs):
        """Adds a conjunctive filter to a query.

        :param q: string or `solrq.Q` object
        :param kwargs: Arguments to construct a `solrq.Q` with
        :return: QueryBuilder object
        :rtype: :py:class:`QueryBuilder`
        """
        if isinstance(q, string_types):
            if self._raw_query is None:
                self._raw_query = []
            self._raw_query.append(q)
        elif isinstance(q, Q) or kwargs:
            if self._query is not None:
                raise ApiError(
                    "Use .and_() or .or_() for an extant solrq.Q object")
            if kwargs:
                self._process_guid = self._process_guid or kwargs.get(
                    "process_guid")
                q = Q(**kwargs)
            self._query = q
        else:
            raise ApiError(".where() only accepts strings or solrq.Q objects")

        return self
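A hedged usage sketch for the where() method above. QueryBuilder is the class these methods belong to (its constructor appears in Example #5); the field names used here are assumptions made for illustration only:

# QueryBuilder is assumed to be imported from its defining module
qb = QueryBuilder()
qb.where("process_name:chrome.exe")      # a raw query string is simply collected
qb = QueryBuilder(device_os="WINDOWS")   # kwargs become a solrq.Q via __init__ (Example #5)

# A second Q/kwargs call to .where() raises ApiError once _query is set, so further
# Q filters are added with .and_() / .or_() instead (Examples #12 and #13).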
Example #5
    def __init__(self, **kwargs):
        if kwargs:
            self._query = Q(**kwargs)
        else:
            self._query = None
        self._raw_query = None
        self._process_guid = None
Example #6
def load_accounts(ccvm, period):
    key = "{ccvm}_{period:%Y%m%d}".format(ccvm=ccvm, period=period)
    period_data = accounts_data_cache.get(key, None)

    if not period_data:
        period_data = {}
        # Get all the company accounts for the given
        # company and fiscal_date
        filter = Q(period=period) & Q(ccvm=ccvm)

        _logger.debug(
            "Loading from database the accounts for {} - {}...".format(
                ccvm, str(filter)))
        paginator = (CaravaggioSearchPaginator(
            query_string=str(filter),
            sort="version_exact asc, number_exact asc",
            limit=5000,
            max_limit=5000).models(BovespaAccount).select(
                "version", "number", "name", "financial_info_type",
                "balance_type", "comments", "amount"))

        while paginator.has_next():
            _logger.debug("{0}/{1} accounts loaded from database...".format(
                paginator.get_loaded_docs(), paginator.get_hits()))
            paginator.next()
            for d in paginator.get_results():
                _logger.debug("Raw Account: {}".format(d))
                balance_type_accounts = period_data.setdefault(
                    d.balance_type, {})
                financial_type_accounts = balance_type_accounts.setdefault(
                    d.financial_info_type, {})

                financial_type_accounts[d.number] = {
                    "number": d.number,
                    "name": d.name,
                    "comments": d.comments,
                    "financial_info_type": d.financial_info_type,
                    "balance_type": d.balance_type,
                    "amount": float(d.amount),
                }
        accounts_data_cache[key] = period_data

    return period_data
Example #7
def buildQuery():
    """
    'faculteit' can occur more than once in the request,
    hence MultiDict.getlist
    see: https://werkzeug.palletsprojects.com/en/1.0.x/datastructures/#werkzeug.datastructures.MultiDict
    NB: not actually implemented here yet
    this does not work: return Q(text="amsterdam", type="master", faculteit="FEB", faculteit="FMG")
    this does work:     return Q(text="amsterdam", type="master", faculteit="FEB") & Q(faculteit="FMG")
    """
    args = request.args
    args.getlist('faculteit')  # multiple 'faculteit' values are still dropped by Q(**args) below
    return Q(**args)
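As the docstring notes, Q() cannot take the same keyword argument twice, so repeated 'faculteit' values from the MultiDict have to be combined explicitly. A minimal sketch of one way to do that, mirroring the docstring's working form; it is not part of the original code:

from functools import reduce
from operator import and_

from solrq import Q

def buildQueryMulti(args):
    """args is a werkzeug MultiDict such as request.args."""
    faculteiten = args.getlist('faculteit')           # e.g. ['FEB', 'FMG']
    base = Q(text=args.get('text', '*'), type=args.get('type', '*'))
    if not faculteiten:
        return base
    # one Q per faculteit, chained with & as in the docstring's working example
    return base & reduce(and_, (Q(faculteit=f) for f in faculteiten))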
Example #8
def has_files_to_be_processed(ccvm):
    filter = ~Q(file_url=Range(ANY, ANY)) | Q(status=FILE_STATUS_NOT_PROCESSED)
    filter = Q(ccvm=ccvm) & (filter)

    _logger.debug("Loading from database the files to be crawled...")
    paginator = (CaravaggioSearchPaginator(
        query_string=str(filter), limit=1,
        max_limit=1).models(BovespaCompanyFile).select("ccvm", "doc_type",
                                                       "fiscal_date",
                                                       "version"))

    if paginator.has_next():
        paginator.next()
        if paginator.get_hits() > 0:
            _logger.debug(
                "Company {0} HAS {1} FILES PENDING to be processed...".format(
                    ccvm, paginator.get_hits()))
            return True

    _logger.debug(
        "Company {0} has not files pending to be processed...".format(ccvm))
    return False
Example #9
def specialisedDeeperSearch(queryIndex):
    results = []

    # print('\n SPECIALIZED DEEPER SEARCH')

    # Deeper NLP Pipeline search
    for i in range(len(queryIndex)):
        text = queryIndex[i]['text']
        tokens = queryIndex[i]['tokens']
        stems = queryIndex[i]['stem']
        lemmas = queryIndex[i]['lemma']
        posTags = queryIndex[i]['posTag']
        nounPhrases = queryIndex[i]['nounPhrases']
        hypernyms = queryIndex[i]['hypernym']
        hyponyms = queryIndex[i]['hyponym']
        meronyms = queryIndex[i]['meronym']
        holonyms = queryIndex[i]['holonym']

        results.append(
            solr2.search(
                (Q(text=text) ^ 0.5) & (Q(stem=stems) ^ 0.5) &
                (Q(lemma=lemmas) ^ 4) & (Q(posTag=posTags) ^ 0.02) &
                (Q(nounPhrases=nounPhrases) ^ 5) & (Q(hypernym=hypernyms) ^ 2)
                & (Q(hyponym=hyponyms) ^ 0.5) & (Q(meronym=meronyms) ^ 0.5) &
                (Q(holonym=holonyms) ^ 0.5),
                sort='score desc',
                score=True,
                fl='*,score',
                rows=1))

    for i in range(len(results)):
        # print('\n-----------------------------------------------------------------------------------------------\n')
        # print("Saw {0} result(s).".format(len(results[i])), '\n')
        # print('Input sentence', i + 1, ': ', input[i], '\n')
        for result in results[i]:
            # print("The ID is '{0}'.".format(result['id']))
            # print("The Sentence is '{0}'.".format(result['text']))
            # print("The Score is '{0}'.".format(result['score']))
            # print('\n')
            print(result['text'])
            return result['text']
Example #10
    def not_(self, q, **kwargs):
        """Adds a negative filter to a query.

        :param q: `solrq.Q` object
        :param kwargs: Arguments to construct a `solrq.Q` with
        :return: QueryBuilder object
        :rtype: :py:class:`QueryBuilder`
        """
        if kwargs:
            q = ~Q(**kwargs)

        if isinstance(q, Q):
            if self._query is None:
                self._query = q
            else:
                self._query = self._query & q
        else:
            raise ApiError(".not_() only accepts solrq.Q objects")
Example #11
def deeperSearch(queryIndex):
    results = []

    print('\n DEEPER SEARCH')

    # Deeper NLP Pipeline search
    for i in range(len(queryIndex)):
        tokens = queryIndex[i]['tokens']
        stems = queryIndex[i]['stem']
        lemmas = queryIndex[i]['lemma']
        posTags = queryIndex[i]['posTag']
        nounPhrases = queryIndex[i]['nounPhrases']
        hypernyms = queryIndex[i]['hypernym']
        hyponyms = queryIndex[i]['hyponym']
        meronyms = queryIndex[i]['meronym']
        holonyms = queryIndex[i]['holonym']

        results.append(
            solr2.search(Q(tokens=tokens) & Q(stem=stems) & Q(lemma=lemmas)
                         & Q(posTag=posTags) & Q(nounPhrases=nounPhrases)
                         & Q(hypernym=hypernyms) & Q(hyponym=hyponyms)
                         & Q(meronym=meronyms) & Q(holonym=holonyms),
                         sort='score desc',
                         score=True,
                         fl='*,score'))

    for i in range(len(results)):
        print(
            '\n-----------------------------------------------------------------------------------------------\n'
        )
        print("Saw {0} result(s).".format(len(results[i])), '\n')
        print('Input sentence', i + 1, ': ', input[i], '\n')
        for result in results[i]:
            print("The ID is '{0}'.".format(result['id']))
            print("The Sentence is '{0}'.".format(result['text']))
            print("The Score is '{0}'.".format(result['score']))
            print('\n')
Example #12
    def and_(self, q, **kwargs):
        """Adds a conjunctive filter to a query.

        :param q: string or `solrq.Q` object
        :param kwargs: Arguments to construct a `solrq.Q` with
        :return: QueryBuilder object
        :rtype: :py:class:`QueryBuilder`
        """
        if isinstance(q, string_types):
            self.where(q)
        elif isinstance(q, Q) or kwargs:
            if kwargs:
                q = Q(**kwargs)
            if self._query is None:
                self._query = q
            else:
                self._query = self._query & q
        else:
            raise ApiError(".and_() only accepts strings or solrq.Q objects")

        return self
Example #13
    def or_(self, q, **kwargs):
        """Adds a disjunctive filter to a query.

        :param q: `solrq.Q` object
        :param kwargs: Arguments to construct a `solrq.Q` with
        :return: QueryBuilder object
        :rtype: :py:class:`QueryBuilder`
        """
        if kwargs:
            self._process_guid = self._process_guid or kwargs.get("process_guid")
            q = Q(**kwargs)

        if isinstance(q, Q):
            if self._query is None:
                self._query = q
            else:
                self._query = self._query | q
        else:
            raise ApiError(".or_() only accepts solrq.Q objects")

        return self
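A short, hedged sketch of how the where()/and_()/or_()/not_() methods from Examples #4, #10, #12 and #13 fit together; QueryBuilder and the field names are illustrative assumptions, not taken from the snippets themselves:

# QueryBuilder is assumed to be imported from its defining module
qb = QueryBuilder(process_name="chrome.exe")   # __init__ seeds _query via Q(**kwargs)
qb.and_(device_os="WINDOWS")                   # AND-ed onto the existing solrq.Q
qb.or_(process_guid="1234-abcd")               # OR-ed in; also records _process_guid
qb.not_(process_username="SYSTEM")             # wrapped in ~Q(...) and AND-ed on

# The accumulated solrq.Q lives in qb._query; str(qb._query) yields the Solr query string.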
Example #14
def field():
    rows = int(request.json.get('rows'))

    word_similar = request.json.get('word_similar')

    topic = request.json.get('topic')
    title = request.json.get('title')
    description = request.json.get('description')
    content = request.json.get('content')
    author = request.json.get('author')
    publish_date = request.json.get('publish_date')

    # tokenizer and word similar
    topic = ViTokenizer.tokenize(topic.strip()) if topic else ''
    author = author.strip().replace(' ', '_') if (author
                                                  and author.strip()) else ''
    publish_date = publish_date.strip() if publish_date else ''

    if word_similar == True:
        title = ws.find_word_similar(title.strip()) if title else ""
        description = ws.find_word_similar(
            description.strip()) if description else ""
        content = ws.find_word_similar(content.strip()) if content else ""
    else:
        title = ViTokenizer.tokenize(title.strip()) if title else ''
        description = ViTokenizer.tokenize(
            description.strip()) if description else ''
        content = ViTokenizer.tokenize(content.strip()) if content else ''

    # convert to solrQ
    if topic == '':
        topic_q = Q(topic="*")
    else:
        topic_q = Q(topic=topic)

    if title == '':
        title_q = Q(title="*")
    else:
        title_q = Q(title=title)

    if description == '':
        description_q = Q(description="*")
    else:
        description_q = Q(description=description)

    if content == '':
        content_q = Q(content="*")
    else:
        content_q = Q(content=content)

    if author == '':
        author_q = Q(author="*")
    else:
        author_q = Q(author=author)

    if publish_date == '':
        publish_date_q = Q(publish_date="*")
    else:
        publish_date_q = Q(publish_date=publish_date)

    query = topic_q & title_q & author_q & description_q & content_q & publish_date_q

    query_q = str(query).replace('\\', '').replace('(', '').replace(')', '')
    print(query_q)
    result = solr.search(
        query_q,
        **{
            'rows': rows,
            'hl': 'true',
            'hl.method': 'original',
            'hl.simple.pre': '<mark style="background-color:#ffff0070;">',
            'hl.simple.post': '</mark>',
            'hl.highlightMultiTerm': 'true',
            'hl.fragsize': 100,
            'defType': 'edismax',
            'fl': '*, score',
            # 'bq': '{!func}linear(clicked, 0.01 ,0.0 )',
            'mm': 1,
            'ps': 3,
            'pf': 'topic^1 title^1 content^1 author^1 description^1 publish_date^1',
            'qf': 'topic^1 title^1 content^1 author^1 description^1 publish_date^1',
        })
    highlight = []
    for i in result.highlighting.values():
        highlight.append(i)

    # for i in highlight:
    #     print(i)

    return jsonify(results=list(result), hightlight=highlight)
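The six near-identical "empty string means match-all" branches above can be expressed more compactly. A hedged sketch of the same query construction; the helper name and the reduce-based approach are illustrative, not part of the original endpoint:

from functools import reduce
from operator import and_

from solrq import Q

def build_field_query(topic="", title="", author="", description="",
                      content="", publish_date=""):
    parts = {
        "topic": topic, "title": title, "author": author,
        "description": description, "content": content,
        "publish_date": publish_date,
    }
    # empty values fall back to the match-all wildcard, as in field() above
    return reduce(and_, (Q(**{k: v or "*"}) for k, v in parts.items()))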
Example #15
with open(stopWordsFile) as f:
    stopwords = f.read().splitlines()

cleanQuestionWords = []
for word in words:
    if word.lower() not in stopwords and word:
        cleanQuestionWords.append(word)

cleanWordsStr = ' '.join(cleanQuestionWords)
from solrq import Q, Proximity
import pysolr

solr = pysolr.Solr(solrURL)

#res = solr.search(cleanWordsStr.replace(' ','+'))
res = solr.search(Q(text=Proximity(cleanWordsStr, 10)))
#print len(res), ' results found'

# Just loop over it to access the results.

fout = open('wiki.result', 'w')

for r in res:
    #print("Reading wiki article : '{0}'.".format(r['title']))
    articleBody = r['text'].replace('[[','').replace(']]','')
    fout.write(remove_tags(articleBody))
    break
fout.close()

log = headword.lower()
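Proximity above turns the cleaned question words into a phrase-with-slop query. A small standalone sketch of the same pattern; the Solr core URL and the search terms are placeholders:

import pysolr
from solrq import Q, Proximity

solr = pysolr.Solr("http://localhost:8983/solr/wiki")   # placeholder core URL

# match documents whose text contains these words within a window of 10 terms
query = Q(text=Proximity("capital of france", 10))
print(query)   # should render as a quoted phrase followed by a ~10 proximity suffix

for doc in solr.search(str(query), rows=5):
    print(doc.get("title"))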
Example #16
import pysolr
import json
from solrq import Q
from pyvi import ViTokenizer
# Setup a Solr instance. The timeout is optional.
solr = pysolr.Solr('http://localhost:8983/solr/bkcv',
                   always_commit=True,
                   timeout=100)

# # general_text = ''
title = ViTokenizer.tokenize("Ba ôtô dàn hàng ngang vượt đèn đỏ")
# print(title)
# query = Q(title="\"{}\"".format(title))^2 \
#         | Q(description="Hải Phòng Khi tín hiệu đèn đỏ còn ở giây")^1

query = Q(content="\"Nước Anh chính_thức phong_tỏa\"")
print(query)
results = solr.search(
    str(query).replace("\\", ""),
    **{
        'rows': 100000,
        'hl': 'true',
        'hl.method': 'original',
        'hl.simple.pre': '<mark style="background-color:#ffff0070;">',
        'hl.simple.post': '</mark>',
        'hl.highlightMultiTerm': 'true',
        'hl.fragsize': 100,
        'defType': 'edismax',
        'fl': '*, score',
        # 'bq':'{!func}linear(clicked, 0.01 ,0.0 )',
        # # 'bq':'{!func}log(linear(clicked, 20 ,0.0 ))',
    })
Example #17
def get_documents_in_node(
    node: SophoraNode,
    *,
    document_type: Optional[str] = None,
    sort_field: Optional[str] = "modificationDate_dt",
    sort_order: Optional[str] = "desc",
    max_age: Union[dt.timedelta, dt.datetime, None] = None,
    force_exact: bool = False,
) -> Generator[Dict, None, None]:
    """Request all Sophora documents in a specific node.

    Args:
        node (SophoraNode): Sohopra node to request data for
        force_exact (bool, optional): If true, forces ``EXACT`` matching type instead of
            ``STARTS`` for the sophora node, even if ``node.use_exact_search`` is ``False``.
            Defaults to False.

    Yields:
        Generator[Dict, None, None]: The parsed JSON of individual Sophora documents as retrieved from the API.
    """
    node_str = node.node
    use_exact = force_exact or node.use_exact_search

    if not use_exact:
        node_str = node_str + "/"

    params = {
        "structureNodePath": node_str,
    }

    if document_type is not None:
        params["documentType"] = document_type

    if sort_field is not None:
        params["sortField"] = sort_field

    if sort_order is not None:
        params["sortOrder"] = sort_order

    if max_age is not None:
        if isinstance(max_age, dt.timedelta) and max_age.total_seconds() > 0:
            max_age = -max_age
        elif isinstance(max_age, dt.datetime):
            max_age = max_age.astimezone(UTC)

        # Encode filter query with solrq
        params["filterQueries"] = Q(
            modificationDate_dt=Range(max_age, dt.timedelta()))

    url = _sophora_api_url(
        "getDocumentsByStructureNodePath",
        "EXACT" if use_exact else "STARTS",
        "1",
        "20",
    )
    logger.info("Paging through URL {}", url)

    while True:
        response = requests.get(url, params=params)
        response.raise_for_status()
        logger.debug(response.request.url)

        response_data = response.json()
        yield from response_data["data"]

        if response_data["moreLink"] is None:
            break
        else:
            url = response_data["moreLink"]["moreUrl"]

            # Remove badly unescaped query from URL
            parsed = urlparse(url)
            url = parsed.copy_with(query=None, fragment=None).unsplit()
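A small hedged sketch (not part of the original module) of the max_age handling above: a positive timedelta is negated so it reads as "since N ago", then wrapped in a solrq Range that ends at now and used as the filterQueries parameter:

import datetime as dt

from solrq import Q, Range

max_age = dt.timedelta(hours=6)
if isinstance(max_age, dt.timedelta) and max_age.total_seconds() > 0:
    max_age = -max_age                       # look backwards from now, as in the function above

# Range from "6 hours ago" up to now (timedelta() is a zero offset)
fq = Q(modificationDate_dt=Range(max_age, dt.timedelta()))
print(str(fq))   # a Solr date-math range on modificationDate_dt; exact rendering is up to solrq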