Beispiel #1
0
def test_array():
    """With 'array': True, every matching element's text is collected into a list."""
    xml = '<baz><foo>bar</foo><foo>bar2</foo></baz>'
    mapping = [{'column': 'foo', 'path': 'foo', 'array': True}]
    tree = lxml.etree.fromstring(xml)
    assert extract(tree, mapping) == {'foo': ['bar', 'bar2']}

    # Dropping the flag while multiple elements match must raise.
    with pytest.raises(ValueError):
        del mapping[0]['array']
        extract(tree, mapping)
Beispiel #2
0
def test_atts():
    """Attribute constraints in 'attrs' must all match for extraction to succeed."""
    xml = '<baz><foo a="b" c="d">abc</foo></baz>'
    mapping = [{'column': 'foo', 'path': 'foo', 'attrs': {'a': 'b', 'c': 'd'}}]
    assert extract(lxml.etree.fromstring(xml), mapping) == {'foo': 'abc'}

    # A constraint naming an attribute the element lacks raises KeyError.
    with pytest.raises(KeyError):
        mapping[0]['attrs'] = {'b': 'c'}
        extract(lxml.etree.fromstring(xml), mapping)
Beispiel #3
0
def test_extract(spark_session: SparkSession) -> None:
    """main.extract loads the text file as one string row per line."""
    in_path = "tests/unit/some_text_file.txt"
    lines = [
        "hello world world",
        "hello world",
        "",
        "test",
    ]
    expected_df = spark_session.createDataFrame(lines, schema=StringType())
    actual_df = main.extract(spark_session, in_path)
    assert actual_df.collect() == expected_df.collect()
Beispiel #4
0
def test_nested_and_array():
    """A nested 'mapping' with an array leaf yields a dict containing a list."""
    xml = '<baz><foo><bak>a1</bak><bak>a2</bak></foo></baz>'
    inner = [{'column': 'bak', 'path': 'bak', 'array': True}]
    mapping = [{'column': 'foo', 'path': 'foo', 'mapping': inner}]
    tree = lxml.etree.fromstring(xml)
    assert extract(tree, mapping) == {'foo': {'bak': ['a1', 'a2']}}
Beispiel #5
0
def test_attr_and_nested():
    """Nested mappings: path None reads element text, '#name' reads an attribute."""
    xml = '<baz><foo prop="abc">bar</foo></baz>'
    inner = [
        {'column': 'nested', 'path': None},     # element's own text
        {'column': 'nested2', 'path': '#prop'},  # attribute lookup
    ]
    mapping = [{'column': 'bak', 'path': 'foo', 'mapping': inner}]
    tree = lxml.etree.fromstring(xml)
    assert extract(tree, mapping) == {'bak': {'nested': 'bar', 'nested2': 'abc'}}
Beispiel #6
0
def UploadManager(filename):
    """Run the full ingestion pipeline for *filename* if it is a Word document.

    Pipeline: download blob data, extract observations, load the spaCy
    model, transform, persist (load / load_to_json), push to CosmosDB and
    set up Azure Search.

    Returns:
        'NOT A WORD DOCUMENT' when the filename check fails; otherwise
        None after the pipeline completes.
    """
    # get_name_of_filepath doubles as the document-type check: it returns
    # the sentinel string 'NOT A WORD DOCUMENT' for unsupported files.
    if get_name_of_filepath(filename) != 'NOT A WORD DOCUMENT':
        print(filename)

        main.blob_download()

        logging.getLogger().setLevel(logging.INFO)

        # Extract data from upstream.
        observations = main.extract()

        # Load the spaCy English model.
        nlp = spacy.load('en')

        # Transform data to have appropriate fields.
        observations, nlp = main.transform(observations, nlp)

        # Load data for downstream consumption.
        main.load(observations, nlp)
        main.load_to_json(observations, nlp)

        # Send data to CosmosDB.
        main.send_to_Cosmos()
        print('sent to cosmos')
        # Fix: original message read "ready to be send" (grammar).
        print("Data is ready to be sent to cosmos")

        # Create Azure Search datasource, index and indexer.
        main.implement_Azure_search()
    else:
        print('error')
        return 'NOT A WORD DOCUMENT'
Beispiel #7
0
def UploadManager_path(filepath, output_directorypath):
    """Upload the document at *filepath* to blob storage, run the NLP
    pipeline, and mirror the container into *output_directorypath*.

    Returns:
        'NOT A WORD DOCUMENT' when the filepath check fails; otherwise
        None after the pipeline completes.
    """
    blob = BlobManager(config_blob.BLOB_NAME, config_blob.BLOB_KEY)

    # Hoisted: the original called get_name_of_filepath(filepath) three
    # times; it is a pure name lookup, so one call suffices.
    doc_path = get_name_of_filepath(filepath)
    if doc_path != 'NOT A WORD DOCUMENT':
        print(filepath)

        # Split the resolved document path into directory and file name.
        path, filename = os.path.split(doc_path)
        blob.upload(path, filename, config_blob.BLOB_CONTAINER)
        print('blob connection')

        # Split the output path into root and leaf directory.
        root_output, directory_output = os.path.split(output_directorypath)
        print(directory_output)
        print(root_output)
        blob.download_all_blobs(root_output, directory_output, config_blob.BLOB_CONTAINER)

        logging.getLogger().setLevel(logging.INFO)

        # Extract data from upstream.
        observations = main.extract()

        # Load the spaCy English model.
        nlp = spacy.load('en')

        # Transform data to have appropriate fields.
        observations, nlp = main.transform(observations, nlp)

        # Load data for downstream consumption.
        main.load(observations, nlp)
        main.load_to_json(observations, nlp)

        # Send data to CosmosDB.
        # NOTE(review): sibling UploadManager calls main.send_to_Cosmos();
        # here a bare send_to_Cosmos() is used — confirm which name is in scope.
        send_to_Cosmos()
        print('sent to cosmos')
    else:
        print('error' + filepath)
        return 'NOT A WORD DOCUMENT'
def main(audiofile, output):
    """Write a tab-separated label file marking the detected chorus.

    The first highlight reported by extract() becomes the "chorus"
    interval; the audio before and after it is labelled "others".
    """
    highlights = list(
        extract([audiofile],
                length=30,
                save_score=False,
                save_thumbnail=False,
                save_wav=False))
    begin, end = highlights[0]
    total = librosa.get_duration(filename=audiofile)

    # (start, stop, label) rows covering the whole track.
    rows = [
        (0, begin, "others"),
        (begin, end, "chorus"),
        (end, total, "others"),
    ]
    table = np.array(rows, np.dtype("f, f, U16"))
    np.savetxt(output, table, fmt=["%.2f", "%.2f", "%s"], delimiter="\t")
Beispiel #9
0
def test_basic():
    """A single matching element maps its text straight to the column."""
    tree = lxml.etree.fromstring('<baz><foo>bar</foo></baz>')
    result = extract(tree, [{'column': 'foo', 'path': 'foo'}])
    assert result == {'foo': 'bar'}
Beispiel #10
0
def test_empty():
    """A path with no match yields None for that column."""
    tree = lxml.etree.fromstring('<baz><foo>bar</foo></baz>')
    result = extract(tree, [{'column': 'bak', 'path': 'bak'}])
    assert result == {'bak': None}
Beispiel #11
0
if __name__ == '__main__':
    # CLI: extract account identifiers from a page fetched by URL or read
    # from a local file (--file).
    parser = argparse.ArgumentParser(description='Extract accounts\' identifiers from pages.')
    parser.add_argument('url', help='url to parse')
    parser.add_argument('--cookies', default='', help='cookies to make http requests with auth')
    parser.add_argument('--debug', action='store_true', help='log debug information')
    parser.add_argument('--file', action='store_true', help='load from file instead of URL')

    args = parser.parse_args()

    # --debug switches the root logger to DEBUG.
    log_level = logging.INFO if not args.debug else logging.DEBUG

    logging.basicConfig(level=log_level, format='-'*40 + '\n%(levelname)s: %(message)s')

    if not args.file:
        url = args.url
        page, status = parse(url, args.cookies)

        # NOTE: a non-200 response is only logged; extraction is still
        # attempted on whatever body was returned (best effort).
        if status != 200:
            logging.info('Answer code {}, something went wrong'.format(status))
    else:
        # Fix: the original leaked the file handle via a bare open().
        with open(args.url) as fh:
            page = fh.read()

    info = extract(page)
    if not info:
        sys.exit()

    logging.info('Result\n' + '-'*40)
    for key, value in info.items():
        print('%s: %s' % (key, value))