Example #1
def metadata(*urns):
    # look up the given dpa URNs in the index and return the matching documents
    result = query(fq=['dpaId:"%s"' % urn for urn in urns],
                   wt="json",
                   omitHeaders=True,
                   q='*',
                   fl='rfc4180,dpaId,dpaTitle')
    return result["response"]["docs"]
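A minimal usage sketch (not part of the original example), assuming metadata() lives in a module where query is imported from the neofonie module as in Example #3; the ids below are hypothetical placeholders:

for doc in metadata("example-dpa-id-1", "example-dpa-id-2"):
    print(doc["dpaId"], doc["dpaTitle"])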
Example #2
def newest(sourceId="dpa"):
    # return up to 100 documents from the given source created in the
    # previous or current hour, newest first
    result = query(fq=[
        "createdAt:[NOW/HOUR-1HOUR TO NOW/HOUR+1HOUR]",
        'sourceId:"%s"' % sourceId,
    ],
                   wt="json",
                   omitHeaders=True,
                   q='*',
                   rows=100,
                   sort="createdAt desc",
                   fl='createdAt,dpaId')
    return result["response"]["docs"]
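A similar usage sketch, again assuming query from the neofonie module is available; it lists id and creation time of the most recent dpa documents:

for doc in newest(sourceId="dpa"):
    print(doc["createdAt"], doc["dpaId"])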
Example #3
# coding: utf-8

# In[60]:

from neofonie import query
from collections import defaultdict

# facet query: surface forms of unknown persons in tweets labelled "München"
# within the given time window (the filter values are URL-encoded)
unknown = query(
    '*&wt=json&fq=createdAt:[2016-07-22T15:44:39.000Z%20TO%202016-07-23T04:44:39.000Z]&fq=sourceId:"twitter"&facet.mincount=1&fq=labels:"M%C3%BCnchen"&facet=true&facet.field=unknownPersonsSurfaceforms&facet.limit=200&facet.missing=true&f.unknownPersonsSurfaceforms.facet.sort=count&facet.method=enum'
)
persons = defaultdict(lambda: 0)

# Solr returns facet counts as a flat list [value, count, value, count, ...];
# fold it into a mapping of surface form -> total count
personsTable = unknown["facet_counts"]["facet_fields"][
    'unknownPersonsSurfaceforms']

for i in range(0, len(personsTable), 2):
    persons[str(personsTable[i])] += personsTable[i + 1]

# In[86]:

# exclude surface forms based on regular expressions

import re

exclude = {
    a
    for a in persons.keys() if re.search(r"\bGmbh\b", a, re.I) or re.search(
        r"\b(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b",
        a, re.I) or re.search(r"[:\"]", a) or re.search(r" \. ", a)
}
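A short follow-up sketch (not part of the original notebook) that lists the most frequent surface forms surviving the exclusion patterns:

# keep only surface forms that were not excluded, most frequent first
remaining = {name: count for name, count in persons.items() if name not in exclude}
for name, count in sorted(remaining.items(), key=lambda kv: kv[1], reverse=True)[:20]:
    print(count, name)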
Example #4
def mkdirs_and_open(f, *args):
    # only the return line of this helper survived in the original listing;
    # the assumed body creates missing parent directories before opening
    os.makedirs(os.path.dirname(f) or ".", exist_ok=True)
    return open(f, *args)



Example #5
def generate(config):
    # assumes module-level imports of datetime, copy, os, json, logging,
    # collections.OrderedDict, reduce, and the project's neofonie,
    # datapipeline and pipe helper modules
    texts = {}
    docs = OrderedDict()
    for day in range(0, config.days):
        date = config.startdate - datetime.timedelta(days=day)
        label = config.indexlabel.format(**locals())
        results = OrderedDict()
        indexfilename = config.indexfile.format(**locals())
        docs[label] = indexfilename
        dayquery = copy.deepcopy(config.basequery)
        dayquery["fq"].append(
            date.strftime(
                'createdAt:[%Y-%m-%dT00:00:00.000Z TO %Y-%m-%dT23:59:59.999Z]')
        )
        for (k, n) in config.branchen.items():
            nq = copy.deepcopy(dayquery)
            nq["fq"].append('+sectors:"{0}"'.format(k))
            # run the raw documents through the project's datapipeline helpers:
            # rename attributes, apply the configured filter, deduplicate by
            # title, fill in default attributes and add the sectors to the subtitle
            res = list(
                neofonie.query("*", **nq)["response"]["docs"]
                | datapipeline.rename_attributes(config.rename)
                | pipe.where(config.filter)
                | datapipeline.deduplicate(key=lambda a: a["title"])
                | datapipeline.default_attributes(('sourcelink', 'source',
                                                   'subtitle'))
                | datapipeline.call(add_sectors_to_subtitle))
            logging.debug("Sector: %s - %s - %s docs" %
                          (k, date.strftime("%Y-%m-%d"), len(res)))
            for item in res:
                logging.debug(
                    "     %s %s %s" %
                    (item["sectors"], item["title"], item["text"][:30]))
            if len(res) > 0:
                results[k] = dict(docs=res, label=n)
        for nr in results.values():
            for doc in nr["docs"]:
                filename = config.docfile.format(**locals())
                doc["document"] = filename
                ndoc = copy.deepcopy(doc)
                ndoc["index"] = os.path.join("..", indexfilename)
                ndoc["sector"] = doc["sectors"][0]
                ndoc["root"] = os.path.join("..", config.rootfile)
                ndoc["source"] = "ex neoApplication"
                ndoc["sourcelink"] = "ex neoURL"
                ndoc["subtitle"] = "Untertitel zu {}".format(
                    ndoc.get("title", "---"))
                texts[os.path.join(config.directory, filename)] = ndoc
                if "text" in doc:
                    del (doc["text"])
        with mkdirs_and_open(os.path.join(config.directory, indexfilename),
                             "w") as of:
            json.dump(
                dict(news=results,
                     root=config.rootfile,
                     rootlabel=config.rootlabel), of)
            logging.info("%s items written to %s" %
                         (reduce(lambda a, b: a + b,
                                 (len(a["docs"])
                                  for a in results.values()), 0), of.name))

    for (k, v) in texts.items():
        json.dump(v, mkdirs_and_open(k, "w"))
    logging.debug("%s news objects written" % len(list(texts.keys())))
    t = copy.deepcopy(config.template)
    t["chapters"] = docs
    json.dump(t, open(os.path.join(config.directory, config.rootfile), "w"))
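generate() reads a number of attributes from its config argument. The following sketch shows the rough shape such a configuration might take; every value is a hypothetical placeholder inferred from how generate() uses the attribute, not taken from the original project:

import datetime
from types import SimpleNamespace

config = SimpleNamespace(
    days=7,                                         # how many days to go back
    startdate=datetime.date.today(),                # anchor date
    basequery={"fq": ["sourceId:dpa"], "wt": "json", "rows": 100},
    branchen={"wirtschaft": "Wirtschaft"},          # sector key -> display label
    rename={"dpaTitle": "title"},                   # attribute renaming map
    filter=lambda doc: True,                        # keep every document
    indexlabel="{date:%Y-%m-%d}",
    indexfile="index-{date:%Y-%m-%d}.json",
    docfile="doc-{doc[dpaId]}.json",
    rootfile="root.json",
    rootlabel="News",
    directory="output",
    template={"title": "News", "chapters": None},
)
generate(config)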
Example #6
def neofonie_query(day1, day2, rows_batch):
    # createdAt range filter from the start of day1 to the end of day2
    dayframe = "".join(["createdAt:[", day1, "T00:00:00.001Z TO ",
                        day2, "T23:59:59.999Z]"])

    counter = 1
    start_position = 0

    # total number of hits; unknown until the first batch has been fetched
    numFound = None

    # page through the result set in batches of rows_batch documents
    while numFound is None or start_position <= numFound:
        result=(query('*:*',
                    wt="json",
                    fq=[
                        "sourceId:dpa",
                        dayframe,
                        "-dpaRessort:rs",
                        "-dpaTitle:Tagesvorschau",
                        "-dpaTitle:Abendvorschau",
                        "-dpaTitle:Morgenvorschau",
                        "-dpaTitle:Terminvorschau",
                        "-dpaTitle:DAX",
                        "-dpaTitle:Ausgewählte Investmentfond",
                        "-dpaTitle:*Ausgewählte Auslandsaktien*",
                        "-dpaTitle:EuroStoxx",
                        "-dpaTitle:MDAX",
                        "-dpaTitle:TecDAX",
                        "-dpaTitle:Devisenkurse am",
                        "-dpaId:*dpa-afx*",
                        # "-text:-----------------------",
                        "-text:berichtet dpa heute wie folgt",
                        "-dpaTitle:DGAP-News"
                    ],
                    fl="createdAt,dpaTitle,dpaId,dpaRessort,dpaServices,text,dpaKeywords,dpaService,dpaservices,dpaservice",
                        # "dpaServices",
                        # "createdAt",
                        # "dpaId",
                    sort="createdAt asc",
                    start=start_position,
                    rows=rows_batch
                    )
                )
        print("\ndownloaded  batch\n")
        numFound=int(result["response"]["numFound"])
        amount_batches=numFound//rows_batch
        last_batch=numFound%rows_batch
        amount_batches=numFound//rows_batch
        last_batch=numFound%rows_batch
        print("\n Amount of articles:",numFound,"\n")
        docs=result["response"]["docs"]
        # loop over the documents of this batch
        for doc in docs:
            # print("Loop for file {0} for title {dpaTitle}".format(filename, **d))  # d["dpaTitle"]
        # ##DPA ID as filename
        #     string_begin_temp =(doc["dpaId"])
        #     string_begin_temp = string_begin_temp.replace(":","_")
        #     string_begin=string_begin_temp.replace('/', 'v-')
        # #Writing the file               
        #     string_end=".json"
        #     filename="".join([string_begin,string_end])
        #     foldername=(doc["createdAt"])
        #     foldername=foldername[0:10]
        #     # filename="/Users/alex/python_project/outputs/DPA-Meldungen/{string_begin}.json".format(**locals())
        #     # #
        #     # directory name from createdAt
        #     # os.makedirectory ???
        #     #
        #     # try / except 
        #     #
        #     file_path = "".join(["/Users/alex/python_project/outputs/DPA-Meldungen/",foldername,"/"])
        #     try: 
        #         os.makedirs(file_path)
        #     except OSError:
        #         if not os.path.isdir(file_path):
        #             raise  

        #     with open(file_path+filename, 'w') as f:    
        #         json.dump(doc,f)
        #         f.close()
        #         print("\nSaved:", filename,"\n","Article Number",counter)

            print(doc)
            # collect the fields of interest and store them in the articles
            # collection (assumed to be defined elsewhere, e.g. a database
            # collection object)
            insert_dic = {
                "dpaId": doc["dpaId"],
                "text": doc["text"],
                "createdAt": doc["createdAt"],
                "dpaTitle": doc["dpaTitle"],
                "dpaRessort": doc["dpaRessort"]
                # "dpaServices": doc["dpaServices"]
            }
            articles.insert(insert_dic)
            counter = counter + 1
        #Moving the file
        #old_position="".join(["/Users/alex/python_project/",filename])
        #new_position="".join(["/Users/alex/python_project/outputs/DPA-Meldungen/",filename])
        #os.rename(old_position, new_position)
        #print("\nMoved file",filename)  
        
        # advance to the next batch; after the last full batch, shrink the
        # request size to the remaining number of documents
        if start_position <= amount_batches * rows_batch:
            start_position = start_position + rows_batch
        else:
            start_position = start_position + rows_batch
            rows_batch = last_batch
        

    print("\n\n**FINISHED**")
    print (doc)
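A usage sketch, assuming query is imported from the neofonie module and articles is a collection-like object with an insert() method (for example a database collection); the dates and batch size are placeholders:

neofonie_query("2016-07-22", "2016-07-23", rows_batch=500)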