Example #1
# -*- coding: utf-8 -*-

from SeaCOW import Query, Nonprocessor

# Create a Query object and set whatever needs to be set.
q = Query()
q.corpus = 'decow16a-nano'  # Lower-case name of the corpus to use.
q.string = '[word="Gartenzwerg"]'  # A normal CQL string as used in NoSketchEngine.
q.max_hits = -1  # Maximal number of hits to return. Ignored for Nonprocessor.
q.attributes = []  # For counting, you don't need word attributes.
q.structures = []  # ... you don't need structural attributes.
q.references = []  # ... you don't need reference attrs.
q.container = 's'  # Which container structure should be used?

# Using the deduplicator would NOT change the outcome. Switch off.
q.set_deduplication(off=True)

# Create a Processor object and attach it to the Query object.
# The Nonprocessor processor does nothing. You can work with the results
# yourself in the finalise method or just get the hits value from the
# query object. It is the concordance size as reported by Manatee.
# (A minimal sketch of a custom processor follows this example.)
p = Nonprocessor()  # Create a processor object of the appropriate type.
q.processor = p  # Attach the processor to the query.
q.run()  # Run the query.

print('Query was: %s' % (q.string))
print('Corpus used: %s' % (q.corpus))
print('Query returned %d hits.' % (q.hits))
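The comment above mentions doing your own post-processing in the finalise method. Below is a minimal sketch of that idea, reusing the Query object configured above; the subclass name HitCounter is made up, and the assumption that finalise() receives the Query object is inferred from the comment rather than verified against SeaCOW, so check the Processor base class before relying on it.

class HitCounter(Nonprocessor):
    # Hypothetical subclass; the finalise(query) signature is an assumption.
    def finalise(self, query):
        # Work with the results here instead of reading q.hits afterwards.
        print('Finalising: %d hits for %s' % (query.hits, query.string))

q.processor = HitCounter()  # Attach to the same Query object and run again.
q.run()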
Example #2
# -*- coding: utf-8 -*-

from SeaCOW import Query, ConcordanceLoader
import json  # Just for pretty-printing.

# See sample.py for annotations of these attributes.
q = Query()
q.corpus = 'decow16a-nano'
q.string = '[word="Gartenzwerg"]'
q.max_hits = 10
q.attributes = ['word', 'tag']
q.structures = ['s']
q.references = [
    'doc.url', 'doc.bdc', 'doc.tld', 'doc.id', 'div.bpc', 's.idx', 's.type'
]
q.container = 's'
q.set_deduplication()

# The concordance loader has just one settable attribute.
p = ConcordanceLoader()
p.full_structure = True  # Convert token attributes to dicts as well, otherwise |-separated.
q.processor = p
q.run()

# Now you have a nice structured Python object in p.concordance.

# The json library just provides a convenient way of displaying the
# resulting structures.
print(json.dumps(p.concordance[0:2], sort_keys=False, indent=2))
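For a rough idea of how to walk the result programmatically, the loop below assumes only that p.concordance behaves like a list of dict-like hits, which is what the JSON dump suggests; no field names are hard-coded.

# Sketch only: assumes each hit in p.concordance is dict-like.
for i, hit in enumerate(p.concordance[0:2]):
    print('Hit %d carries the fields: %s' % (i, ', '.join(sorted(hit.keys()))))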
Example #3
# -*- coding: utf-8 -*-

from SeaCOW import Query, ConcordanceWriter, DependencyBuilder

# Create a Query object and set whatever needs to be set.
q = Query()
q.corpus = 'decow16a-nano'  # Lower-case name of the corpus to use.
q.string = '[word="Motorsäge"]'  # A normal CQL string as used in NoSketchEngine.
q.max_hits = 10  # Maximal number of hits to return. Use when testing queries!
q.structures = ['s']  # Structure markup to export from corpus.
q.references = [
    'doc.url', 'doc.bdc', 'doc.tld', 'doc.id', 'div.bpc', 's.idx', 's.type'
]
# Which reference attributes (of structures) to export.
q.container = 's'  # Which container structure should be exported?
q.set_deduplication()  # Use deduplication.

# The dependency attributes are needed for DependencyBuilder() to work.
q.attributes = ['word', 'depind', 'dephd', 'deprel', 'tag', 'lemma']

# The dependency builder reconstructs (and outputs) dependency trees.
# If you want to filter structures, create a class that inherits from
# DependencyBuilder and override its filtre() method (no typo there).
# A sketch of this pattern follows this example.
p = DependencyBuilder()

# The following five are 0-based indices into q.attributes as defined above.
p.column_token = 0  # Which column contains the token?
p.column_index = 1  # Which column contains the dependency index?
p.column_head = 2  # Which column contains the dependency head index?
p.column_relation = 3  # Which column contains the dependency relation?
p.attribs = [4, 5]  # Extra attribute columns (tag, lemma) to include.
p.fileprefix = 'motorsaege'  # Output file prefix (illustrative value).
p.savejson = True  # Write the trees as JSON.
p.saveimage = 'png'  # Write tree images in PNG format.
q.processor = p  # Attach the processor to the query.
q.run()  # Run the query.
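As noted above, structures are filtered by subclassing DependencyBuilder and overriding its filtre() method; compare IterativelyFiltredDependencyBuilder in the next example. The sketch below only shows the shape of such a subclass: the filtre() signature and the True/False return convention are assumptions for illustration, so check the SeaCOW source for the actual interface.

class ShortSentenceBuilder(DependencyBuilder):
    # Hypothetical subclass; the filtre() signature and return convention
    # are assumptions, not taken from the SeaCOW source.
    def filtre(self, line):
        # Example criterion: keep only container sentences with at most 20 tokens.
        return len(line) <= 20

An instance of such a subclass would then be attached as q.processor in place of the plain DependencyBuilder() above.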
Example #4
File: iterate.py  Project: rsling/seacow
# -*- coding: utf-8 -*-

from SeaCOW import Query, ConcordanceWriter, DependencyBuilder
from iterated import IterativelyFiltredDependencyBuilder

# FIRST QUERY.

# Create a Query object and set whatever needs to be set.
q = Query()
q.corpus = 'encow16a-nano'
q.string = '[word="give" & tag="VB[ZPD]"]'
q.max_hits = 100
q.attributes = ['word', 'depind', 'dephd', 'deprel', 'tag', 'lemma']
q.structures = ['s']
q.references = [
    'doc.url', 'doc.bdc', 'doc.tld', 'doc.id', 'div.bpc', 's.idx', 's.type'
]
q.container = 's'

# This enables an efficient duplicate remover using a scaling Bloom filter.
q.set_deduplication()

p = IterativelyFiltredDependencyBuilder()
p.column_token = 0
p.column_index = 1
p.column_head = 2
p.column_relation = 3
p.attribs = [4, 5]
p.fileprefix = 'give_iterated'
p.savejson = True
p.saveimage = 'png'
q.processor = p  # Attach the processor and run the first query.
q.run()
Example #5
# -*- coding: utf-8 -*-

# This dumps a very raw concordance format.
# It's very efficient, though.
# See sample.py for options.

from SeaCOW import Query, ConcordanceDumper

q = Query()
q.corpus          = 'decow16b'
q.string          = '[word="Holzweg"]'
q.max_hits        = 10
q.attributes      = ['word']
q.structures      = ['s']
q.references      = ['doc.url', 'doc.id', 's.idx']
q.container       = 's'
q.set_deduplication()

p                 = ConcordanceDumper()
p.filename        = 'output/holzweg.txt'
q.processor       = p
q.run()
Example #6
# -*- coding: utf-8 -*-

import random
from SeaCOW import Query, ConcordanceWriter, DependencyBuilder

random.seed(2914)

q = Query()
q.corpus          = 'precox20lda25'
q.string          = '<doc id="[0-9a-f].+">'
q.random_subset   = 0.09
q.attributes      = ['word']
q.structures      = ['s.idx', 'div.bpc', 'doc.bdc', 'doc.url', 'doc.id', 'doc.pregister', 'doc.pregbrob']
q.references      = ['doc.url', 'doc.id']
q.container       = 'doc'

p                 = ConcordanceWriter()
p.filename        = 'sample.csv'
q.processor       = p
q.run()

Example #7
# -*- coding: utf-8 -*-

# This dumps a very raw concordance format.
# It's very efficient, though.
# See sample.py for options.

from SeaCOW import Query, ConcordanceDumper

q = Query()
q.corpus = 'encow16a-nano'
q.string = '[tag="N."][word="attention"]'
q.max_hits = -1
q.attributes = ['word']
q.structures = ['s']
q.references = ['doc.url', 'doc.id', 's.idx']
q.container = 's'
q.set_deduplication()

p = ConcordanceDumper()
p.filename = 'dump.txt'
q.processor = p
q.run()